From 6cb9dd8364938925b177ef58ca2722e10f9e7fea Mon Sep 17 00:00:00 2001 From: Shiroko Date: Tue, 6 Aug 2024 17:41:43 +0800 Subject: [PATCH 1/4] feat(request-debugging): Finer time resolution and latency in total Using `time_ns` to measure the context duration and introduced a field `total_time_without_upstream` to represent total latency without upstream. KAG-4733 --- ...ger-finer-resolution-and-total-latency.yml | 7 ++++ kong/timing/context.lua | 37 ++++++++++++++++--- 2 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 changelog/unreleased/kong/feat-request-debguger-finer-resolution-and-total-latency.yml diff --git a/changelog/unreleased/kong/feat-request-debguger-finer-resolution-and-total-latency.yml b/changelog/unreleased/kong/feat-request-debguger-finer-resolution-and-total-latency.yml new file mode 100644 index 000000000000..ae866b6c68d2 --- /dev/null +++ b/changelog/unreleased/kong/feat-request-debguger-finer-resolution-and-total-latency.yml @@ -0,0 +1,7 @@ +message: | + Improved the output of the request debugger: + - Now the resolution of field `total_time` is microseconds. + - A new field `total_time_without_upstream` on the top level shows the latency only introduced by Kong. +type: feature +scope: Core + diff --git a/kong/timing/context.lua b/kong/timing/context.lua index 2df5f5ec52bf..e15634276456 100644 --- a/kong/timing/context.lua +++ b/kong/timing/context.lua @@ -8,7 +8,7 @@ local setmetatable = setmetatable local table_insert = table.insert local table_remove = table.remove -local get_cur_msec = require("kong.tools.time").get_updated_monotonic_ms +local time_ns = require("kong.tools.time").time_ns local assert = assert @@ -29,14 +29,13 @@ function _M:enter_subcontext(name) end self.current_subcontext = self.current_subcontext.child[name] - self.current_subcontext.____start____ = get_cur_msec() + self.current_subcontext.____start____ = time_ns() end function _M:leave_subcontext(attributes) assert(#self.sub_context_stack > 0, "subcontext stack underflow") - - local elapsed = get_cur_msec() - self.current_subcontext.____start____ + local elapsed = (time_ns() - self.current_subcontext.____start____) / 1e6 local old_total_time = self.current_subcontext.total_time or 0 self.current_subcontext.total_time = old_total_time + elapsed self.current_subcontext.____start____ = nil @@ -75,11 +74,37 @@ function _M:set_root_context_prop(k, v) end +function _M:finalize(subcontext) + -- finalize total_time optionally rounding to the nearest integer + if subcontext.total_time then + -- round to 2 decimal places + subcontext.total_time = math_floor(subcontext.total_time * 100) / 100 + end + + if subcontext.child then + for _, child in pairs(subcontext.child) do + self:finalize(child) + end + end +end + + +function _M:get_total_time_without_upstream() + local total_time = 0 + for k, child in pairs(self.root_context.child) do + if k ~= "upstream" and child.total_time then + total_time = total_time + child.total_time + end + end + return total_time +end + + function _M:to_json() local dangling = nil -- `> 1` means we have at least one subcontext (the root context) - -- We always call this function at then end of the header_filter and + -- We always call this function at then end of the header_filter and -- log phases, so we should always have at least one subcontext. while #self.sub_context_stack > 1 do self:set_context_prop("dangling", true) @@ -92,6 +117,8 @@ function _M:to_json() end self:set_root_context_prop("dangling", dangling) + self:finalize(self.root_context) + self.root_context.total_time_without_upstream = self:get_total_time_without_upstream() return assert(cjson.encode(self.root_context)) end From da73b5e4a35c67f1efe1411ecaf6ea7e6e8adca2 Mon Sep 17 00:00:00 2001 From: Shiroko Date: Wed, 18 Sep 2024 18:09:56 +0800 Subject: [PATCH 2/4] feat(request-debugging): add tests for float number --- .../01-request-debug_spec.lua | 205 +++++++++++------- 1 file changed, 128 insertions(+), 77 deletions(-) diff --git a/spec/02-integration/21-request-debug/01-request-debug_spec.lua b/spec/02-integration/21-request-debug/01-request-debug_spec.lua index 13d626f474cd..518e05c42224 100644 --- a/spec/02-integration/21-request-debug/01-request-debug_spec.lua +++ b/spec/02-integration/21-request-debug/01-request-debug_spec.lua @@ -287,6 +287,26 @@ local function assert_plugin_has_span(plugin_span, span_name) return true end +local function assert_total_time_is_float(subcontext, current_path) + local current_path = current_path or '$' + + if subcontext.total_time then + -- body filter is mostly zero + if subcontext.total_time ~= 0 then + assert.are_not.equal(subcontext.total_time % 1, 0, current_path .. ".total_time is not a float number"); + end + end + + if subcontext.child then + for path, child in pairs(subcontext.child) do + -- Upstream time is measured by nginx and it's always decimal rather than float + if path ~= "upstream" then + assert_total_time_is_float(child, current_path .. "." .. path) + end + end + end +end + local function start_kong(strategy, deployment, disable_req_dbg, token) local request_debug = nil @@ -500,22 +520,28 @@ describe(desc, function() helpers.wait_for_all_config_update() - local header_output = assert_has_output_header(deployment, "/slow-streaming", "*") - local log_output = assert_has_output_log(deployment, "/slow-streaming", "*") + helpers.pwait_until(function() + local header_output = assert_has_output_header(deployment, "/slow-streaming", "*") + local log_output = assert_has_output_log(deployment, "/slow-streaming", "*") + + local total_header = assert(tonumber(header_output.child.upstream.total_time)) + local tfb_header = assert(tonumber(header_output.child.upstream.child.time_to_first_byte.total_time)) + assert.falsy(header_output.child.upstream.child.streaming) + assert.same(total_header, tfb_header) + + local total_log = assert(tonumber(log_output.child.upstream.total_time)) + local tfb_log = assert(tonumber(log_output.child.upstream.child.time_to_first_byte.total_time)) + local streaming = assert(tonumber(log_output.child.upstream.child.streaming.total_time)) - local total_header = assert(tonumber(header_output.child.upstream.total_time)) - local tfb_header = assert(tonumber(header_output.child.upstream.child.time_to_first_byte.total_time)) - assert.falsy(header_output.child.upstream.child.streaming) - assert.same(total_header, tfb_header) + assert_total_time_is_float(header_output) + assert_total_time_is_float(log_output) - local total_log = assert(tonumber(log_output.child.upstream.total_time)) - local tfb_log = assert(tonumber(log_output.child.upstream.child.time_to_first_byte.total_time)) - local streaming = assert(tonumber(log_output.child.upstream.child.streaming.total_time)) - assert.near(tfb_header, tfb_log, 50) - assert.same(total_log, tfb_log + streaming) + assert.near(tfb_header, tfb_log, 50) + assert.same(total_log, tfb_log + streaming) - assert.near(TIME_TO_FIRST_BYTE, tfb_log, 50) - assert.near(STREAMING, streaming, 50) + assert.near(TIME_TO_FIRST_BYTE, tfb_log, 50) + assert.near(STREAMING, streaming, 50) + end, 10) end) it("rewrite, access, balancer, header_filter, body_filter, log, plugin span, dns span", function() @@ -529,26 +555,31 @@ describe(desc, function() helpers.wait_for_all_config_update() - local header_output = assert_has_output_header(deployment, "/mutiple-spans", "*") - local log_output = assert_has_output_log(deployment, "/mutiple-spans", "*") - - assert.truthy(header_output.child.rewrite) - assert.truthy(header_output.child.access) - assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase - assert.truthy(header_output.child.access.child.router) -- router is executed in access phase - assert(header_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded") - assert.truthy(header_output.child.balancer) - assert.truthy(header_output.child.header_filter) - - assert.truthy(log_output.child.rewrite) - assert.truthy(log_output.child.access) - assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase - assert.truthy(log_output.child.access.child.router) -- router is executed in access phase - assert(log_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded") - assert.truthy(log_output.child.balancer) - assert.truthy(log_output.child.header_filter) - assert.truthy(log_output.child.body_filter) - assert.truthy(log_output.child.log) + helpers.pwait_until(function() + local header_output = assert_has_output_header(deployment, "/mutiple-spans", "*") + local log_output = assert_has_output_log(deployment, "/mutiple-spans", "*") + + assert.truthy(header_output.child.rewrite) + assert.truthy(header_output.child.access) + assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase + assert.truthy(header_output.child.access.child.router) -- router is executed in access phase + assert(header_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded") + assert.truthy(header_output.child.balancer) + assert.truthy(header_output.child.header_filter) + + assert.truthy(log_output.child.rewrite) + assert.truthy(log_output.child.access) + assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase + assert.truthy(log_output.child.access.child.router) -- router is executed in access phase + assert(log_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded") + assert.truthy(log_output.child.balancer) + assert.truthy(log_output.child.header_filter) + assert.truthy(log_output.child.body_filter) + assert.truthy(log_output.child.log) + + assert_total_time_is_float(header_output) + assert_total_time_is_float(log_output) + end, 10) end) it("subrequests involved", function() @@ -568,34 +599,39 @@ describe(desc, function() helpers.wait_for_all_config_update() - local header_output = assert_has_output_header(deployment, "/subrequests", "*") - local log_output = assert_has_output_log(deployment, "/subrequests", "*") - - -- spans of main request - assert.truthy(header_output.child.rewrite) - assert.truthy(header_output.child.access) - assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase - assert.truthy(header_output.child.access.child.router) -- router is executed in access phase - assert.truthy(header_output.child.response) - - assert.truthy(log_output.child.rewrite) - assert.truthy(log_output.child.access) - assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase - assert.truthy(header_output.child.access.child.router) -- router is executed in access phase - assert.truthy(log_output.child.body_filter) - assert.truthy(log_output.child.log) - - -- spans of subrequest - assert.truthy(header_output.child.response.child.balancer) - assert.truthy(header_output.child.response.child.header_filter) - assert.truthy(header_output.child.response.child.plugins) - assert.truthy(header_output.child.response.child.plugins.child["enable-buffering-response"]) - - assert.truthy(log_output.child.response.child.balancer) - assert.truthy(log_output.child.response.child.header_filter) - assert.truthy(log_output.child.response.child.body_filter) - assert.truthy(log_output.child.response.child.plugins) - assert.truthy(log_output.child.response.child.plugins.child["enable-buffering-response"]) + helpers.pwait_until(function() + local header_output = assert_has_output_header(deployment, "/subrequests", "*") + local log_output = assert_has_output_log(deployment, "/subrequests", "*") + + -- spans of main request + assert.truthy(header_output.child.rewrite) + assert.truthy(header_output.child.access) + assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase + assert.truthy(header_output.child.access.child.router) -- router is executed in access phase + assert.truthy(header_output.child.response) + + assert.truthy(log_output.child.rewrite) + assert.truthy(log_output.child.access) + assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase + assert.truthy(header_output.child.access.child.router) -- router is executed in access phase + assert.truthy(log_output.child.body_filter) + assert.truthy(log_output.child.log) + + -- spans of subrequest + assert.truthy(header_output.child.response.child.balancer) + assert.truthy(header_output.child.response.child.header_filter) + assert.truthy(header_output.child.response.child.plugins) + assert.truthy(header_output.child.response.child.plugins.child["enable-buffering-response"]) + + assert.truthy(log_output.child.response.child.balancer) + assert.truthy(log_output.child.response.child.header_filter) + assert.truthy(log_output.child.response.child.body_filter) + assert.truthy(log_output.child.response.child.plugins) + assert.truthy(log_output.child.response.child.plugins.child["enable-buffering-response"]) + + assert_total_time_is_float(header_output) + assert_total_time_is_float(log_output) + end, 10) end) it("external_http span", function() @@ -614,14 +650,19 @@ describe(desc, function() helpers.wait_for_all_config_update() - local header_output = assert_has_output_header(deployment, "/external_http", "*") - local log_output = assert_has_output_log(deployment, "/external_http", "*") + helpers.pwait_until(function() + local header_output = assert_has_output_header(deployment, "/external_http", "*") + local log_output = assert_has_output_log(deployment, "/external_http", "*") - local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["muti-external-http-calls"].child) - assert_plugin_has_span(plugin_span, "external_http") + local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["muti-external-http-calls"].child) + assert_plugin_has_span(plugin_span, "external_http") - plugin_span = assert.truthy(log_output.child.access.child.plugins.child["muti-external-http-calls"].child) - assert_plugin_has_span(plugin_span, "external_http") + plugin_span = assert.truthy(log_output.child.access.child.plugins.child["muti-external-http-calls"].child) + assert_plugin_has_span(plugin_span, "external_http") + + assert_total_time_is_float(header_output) + assert_total_time_is_float(log_output) + end, 10) end) it("redis span", function() @@ -649,14 +690,19 @@ describe(desc, function() helpers.wait_for_all_config_update() - local header_output = assert_has_output_header(deployment, "/redis", "*") - local log_output = assert_has_output_log(deployment, "/redis", "*") + helpers.pwait_until(function() + local header_output = assert_has_output_header(deployment, "/redis", "*") + local log_output = assert_has_output_log(deployment, "/redis", "*") + + local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["rate-limiting"].child) + assert_plugin_has_span(plugin_span, "redis") - local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["rate-limiting"].child) - assert_plugin_has_span(plugin_span, "redis") + plugin_span = assert.truthy(log_output.child.access.child.plugins.child["rate-limiting"].child) + assert_plugin_has_span(plugin_span, "redis") - plugin_span = assert.truthy(log_output.child.access.child.plugins.child["rate-limiting"].child) - assert_plugin_has_span(plugin_span, "redis") + assert_total_time_is_float(header_output) + assert_total_time_is_float(log_output) + end, 10) end) it("truncate/split too large debug output", function() @@ -675,11 +721,16 @@ describe(desc, function() helpers.wait_for_all_config_update() - local header_output = assert_has_output_header(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN) - local _, truncated = assert_has_output_log(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN) + helpers.pwait_until(function() + local header_output = assert_has_output_header(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN) + local _, truncated = assert_has_output_log(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN) + + assert.truthy(header_output.truncated) + assert.truthy(truncated) - assert.truthy(header_output.truncated) - assert.truthy(truncated) + assert_total_time_is_float(header_output) + assert_total_time_is_float(truncated) + end, 10) end) it("invalid X-Kong-Request-Debug request header should not trigger this feature", function() From 9854a64b1a7dc6396e2200a75cf5edbe18eec46c Mon Sep 17 00:00:00 2001 From: Shiroko Date: Mon, 23 Sep 2024 13:20:53 +0800 Subject: [PATCH 3/4] feat(request-debugging): set cjson precision to avoid floating number error --- kong/timing/context.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kong/timing/context.lua b/kong/timing/context.lua index e15634276456..0d3b4f5c88f4 100644 --- a/kong/timing/context.lua +++ b/kong/timing/context.lua @@ -1,4 +1,4 @@ -local cjson = require("cjson.safe") +local cjson = require("cjson.safe").new() local ngx_get_phase = ngx.get_phase local ngx_re_gmatch = ngx.re.gmatch @@ -15,6 +15,8 @@ local assert = assert local _M = {} local _MT = { __index = _M } +-- Set number precision smaller than 16 to avoid floating point errors +cjson.encode_number_precision(14) function _M:enter_subcontext(name) assert(name ~= nil, "name is required") From 4b4acd224a9ff9c2b4704cf0b3d465eee0eb72c2 Mon Sep 17 00:00:00 2001 From: Shiroko Date: Mon, 23 Sep 2024 15:27:57 +0800 Subject: [PATCH 4/4] feat(request-debugging): fix a bug when no child is present --- kong/timing/context.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kong/timing/context.lua b/kong/timing/context.lua index 0d3b4f5c88f4..f05ed29a0518 100644 --- a/kong/timing/context.lua +++ b/kong/timing/context.lua @@ -120,7 +120,9 @@ function _M:to_json() self:set_root_context_prop("dangling", dangling) self:finalize(self.root_context) - self.root_context.total_time_without_upstream = self:get_total_time_without_upstream() + if self.root_context.child ~= nil then + self.root_context.total_time_without_upstream = self:get_total_time_without_upstream() + end return assert(cjson.encode(self.root_context)) end