Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(request-debugging): Finer time resolution and latency in total #13460

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
message: |
Improved the output of the request debugger:
- Now the resolution of field `total_time` is microseconds.
- A new field `total_time_without_upstream` on the top level shows the latency only introduced by Kong.
type: feature
scope: Core

43 changes: 37 additions & 6 deletions kong/timing/context.lua
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
local cjson = require("cjson.safe")
local cjson = require("cjson.safe").new()

local ngx_get_phase = ngx.get_phase
local ngx_re_gmatch = ngx.re.gmatch
Expand All @@ -8,13 +8,15 @@ local setmetatable = setmetatable
local table_insert = table.insert
local table_remove = table.remove

local get_cur_msec = require("kong.tools.time").get_updated_monotonic_ms
local time_ns = require("kong.tools.time").time_ns

local assert = assert

local _M = {}
local _MT = { __index = _M }

-- Set number precision smaller than 16 to avoid floating point errors
cjson.encode_number_precision(14)

function _M:enter_subcontext(name)
assert(name ~= nil, "name is required")
Expand All @@ -29,14 +31,13 @@ function _M:enter_subcontext(name)
end

self.current_subcontext = self.current_subcontext.child[name]
self.current_subcontext.____start____ = get_cur_msec()
self.current_subcontext.____start____ = time_ns()
end


function _M:leave_subcontext(attributes)
assert(#self.sub_context_stack > 0, "subcontext stack underflow")

local elapsed = get_cur_msec() - self.current_subcontext.____start____
local elapsed = (time_ns() - self.current_subcontext.____start____) / 1e6
local old_total_time = self.current_subcontext.total_time or 0
self.current_subcontext.total_time = old_total_time + elapsed
self.current_subcontext.____start____ = nil
Expand Down Expand Up @@ -75,11 +76,37 @@ function _M:set_root_context_prop(k, v)
end


function _M:finalize(subcontext)
-- finalize total_time optionally rounding to the nearest integer
Oyami-Srk marked this conversation as resolved.
Show resolved Hide resolved
if subcontext.total_time then
-- round to 2 decimal places
subcontext.total_time = math_floor(subcontext.total_time * 100) / 100
end

if subcontext.child then
for _, child in pairs(subcontext.child) do
self:finalize(child)
end
end
end


function _M:get_total_time_without_upstream()
local total_time = 0
for k, child in pairs(self.root_context.child) do
if k ~= "upstream" and child.total_time then
total_time = total_time + child.total_time
end
end
return total_time
end


function _M:to_json()
local dangling = nil

-- `> 1` means we have at least one subcontext (the root context)
-- We always call this function at then end of the header_filter and
-- We always call this function at then end of the header_filter and
-- log phases, so we should always have at least one subcontext.
while #self.sub_context_stack > 1 do
self:set_context_prop("dangling", true)
Expand All @@ -92,6 +119,10 @@ function _M:to_json()
end

self:set_root_context_prop("dangling", dangling)
self:finalize(self.root_context)
if self.root_context.child ~= nil then
self.root_context.total_time_without_upstream = self:get_total_time_without_upstream()
end
return assert(cjson.encode(self.root_context))
end

Expand Down
205 changes: 128 additions & 77 deletions spec/02-integration/21-request-debug/01-request-debug_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,26 @@ local function assert_plugin_has_span(plugin_span, span_name)
return true
end

local function assert_total_time_is_float(subcontext, current_path)
local current_path = current_path or '$'

if subcontext.total_time then
-- body filter is mostly zero
if subcontext.total_time ~= 0 then
assert.are_not.equal(subcontext.total_time % 1, 0, current_path .. ".total_time is not a float number");
end
end

if subcontext.child then
for path, child in pairs(subcontext.child) do
-- Upstream time is measured by nginx and it's always decimal rather than float
if path ~= "upstream" then
assert_total_time_is_float(child, current_path .. "." .. path)
end
end
end
end


local function start_kong(strategy, deployment, disable_req_dbg, token)
local request_debug = nil
Expand Down Expand Up @@ -500,22 +520,28 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/slow-streaming", "*")
local log_output = assert_has_output_log(deployment, "/slow-streaming", "*")
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/slow-streaming", "*")
local log_output = assert_has_output_log(deployment, "/slow-streaming", "*")

local total_header = assert(tonumber(header_output.child.upstream.total_time))
local tfb_header = assert(tonumber(header_output.child.upstream.child.time_to_first_byte.total_time))
assert.falsy(header_output.child.upstream.child.streaming)
assert.same(total_header, tfb_header)

local total_log = assert(tonumber(log_output.child.upstream.total_time))
local tfb_log = assert(tonumber(log_output.child.upstream.child.time_to_first_byte.total_time))
local streaming = assert(tonumber(log_output.child.upstream.child.streaming.total_time))

local total_header = assert(tonumber(header_output.child.upstream.total_time))
local tfb_header = assert(tonumber(header_output.child.upstream.child.time_to_first_byte.total_time))
assert.falsy(header_output.child.upstream.child.streaming)
assert.same(total_header, tfb_header)
assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)

local total_log = assert(tonumber(log_output.child.upstream.total_time))
local tfb_log = assert(tonumber(log_output.child.upstream.child.time_to_first_byte.total_time))
local streaming = assert(tonumber(log_output.child.upstream.child.streaming.total_time))
assert.near(tfb_header, tfb_log, 50)
assert.same(total_log, tfb_log + streaming)
assert.near(tfb_header, tfb_log, 50)
assert.same(total_log, tfb_log + streaming)

assert.near(TIME_TO_FIRST_BYTE, tfb_log, 50)
assert.near(STREAMING, streaming, 50)
assert.near(TIME_TO_FIRST_BYTE, tfb_log, 50)
assert.near(STREAMING, streaming, 50)
end, 10)
end)

it("rewrite, access, balancer, header_filter, body_filter, log, plugin span, dns span", function()
Expand All @@ -529,26 +555,31 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/mutiple-spans", "*")
local log_output = assert_has_output_log(deployment, "/mutiple-spans", "*")

assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert(header_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(header_output.child.balancer)
assert.truthy(header_output.child.header_filter)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(log_output.child.access.child.router) -- router is executed in access phase
assert(log_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(log_output.child.balancer)
assert.truthy(log_output.child.header_filter)
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/mutiple-spans", "*")
local log_output = assert_has_output_log(deployment, "/mutiple-spans", "*")

assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert(header_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(header_output.child.balancer)
assert.truthy(header_output.child.header_filter)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(log_output.child.access.child.router) -- router is executed in access phase
assert(log_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(log_output.child.balancer)
assert.truthy(log_output.child.header_filter)
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)

assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("subrequests involved", function()
Expand All @@ -568,34 +599,39 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/subrequests", "*")
local log_output = assert_has_output_log(deployment, "/subrequests", "*")

-- spans of main request
assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(header_output.child.response)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)

-- spans of subrequest
assert.truthy(header_output.child.response.child.balancer)
assert.truthy(header_output.child.response.child.header_filter)
assert.truthy(header_output.child.response.child.plugins)
assert.truthy(header_output.child.response.child.plugins.child["enable-buffering-response"])

assert.truthy(log_output.child.response.child.balancer)
assert.truthy(log_output.child.response.child.header_filter)
assert.truthy(log_output.child.response.child.body_filter)
assert.truthy(log_output.child.response.child.plugins)
assert.truthy(log_output.child.response.child.plugins.child["enable-buffering-response"])
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/subrequests", "*")
local log_output = assert_has_output_log(deployment, "/subrequests", "*")

-- spans of main request
assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(header_output.child.response)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)

-- spans of subrequest
assert.truthy(header_output.child.response.child.balancer)
assert.truthy(header_output.child.response.child.header_filter)
assert.truthy(header_output.child.response.child.plugins)
assert.truthy(header_output.child.response.child.plugins.child["enable-buffering-response"])

assert.truthy(log_output.child.response.child.balancer)
assert.truthy(log_output.child.response.child.header_filter)
assert.truthy(log_output.child.response.child.body_filter)
assert.truthy(log_output.child.response.child.plugins)
assert.truthy(log_output.child.response.child.plugins.child["enable-buffering-response"])

assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("external_http span", function()
Expand All @@ -614,14 +650,19 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/external_http", "*")
local log_output = assert_has_output_log(deployment, "/external_http", "*")
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/external_http", "*")
local log_output = assert_has_output_log(deployment, "/external_http", "*")

local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")
local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")

plugin_span = assert.truthy(log_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")
plugin_span = assert.truthy(log_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")

assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("redis span", function()
Expand Down Expand Up @@ -649,14 +690,19 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/redis", "*")
local log_output = assert_has_output_log(deployment, "/redis", "*")
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/redis", "*")
local log_output = assert_has_output_log(deployment, "/redis", "*")

local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")

local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")
plugin_span = assert.truthy(log_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")

plugin_span = assert.truthy(log_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")
assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("truncate/split too large debug output", function()
Expand All @@ -675,11 +721,16 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)
local _, truncated = assert_has_output_log(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)
local _, truncated = assert_has_output_log(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)

assert.truthy(header_output.truncated)
assert.truthy(truncated)

assert.truthy(header_output.truncated)
assert.truthy(truncated)
assert_total_time_is_float(header_output)
assert_total_time_is_float(truncated)
end, 10)
end)

it("invalid X-Kong-Request-Debug request header should not trigger this feature", function()
Expand Down
Loading