Skip to content

Commit

Permalink
Serve expired cache update fixes (#1174)
Browse files Browse the repository at this point in the history
- Fixes a regression bug with serve-expired that appeared in 1.22.0
  and would not allow the iterator to update the cache with
  not-yet-validated entries resulting in increased outgoing traffic.

- Treat serve_expired_norec_ttl as a backoff timer for failed updates of expired records.
- Try to use expired answers instead of SERVFAIL if serve-expired is
  enabled even without serve-expired-client-timeout.
- Add suggestion to refresh the cached norec_ttl and expired_ttl when a
  response cannot update the usable expired entry.
  • Loading branch information
gthess authored Dec 31, 2024
1 parent e57e537 commit fff9f62
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 46 deletions.
6 changes: 3 additions & 3 deletions daemon/worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -1845,10 +1845,10 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
* its qname must be that used for cache
* lookup. */
if((worker->env.cfg->prefetch &&
*worker->env.now >= rep->prefetch_ttl) ||
rep->prefetch_ttl <= *worker->env.now) ||
(worker->env.cfg->serve_expired &&
*worker->env.now > rep->ttl)) {

rep->ttl < *worker->env.now &&
!(*worker->env.now < rep->serve_expired_norec_ttl))) {
time_t leeway = rep->ttl - *worker->env.now;
if(rep->ttl < *worker->env.now)
leeway = 0;
Expand Down
41 changes: 38 additions & 3 deletions services/cache/dns.c
Original file line number Diff line number Diff line change
Expand Up @@ -1056,7 +1056,7 @@ dns_cache_lookup(struct module_env* env,

int
dns_cache_store(struct module_env* env, struct query_info* msgqinf,
struct reply_info* msgrep, int is_referral, time_t leeway, int pside,
struct reply_info* msgrep, int is_referral, time_t leeway, int pside,
struct regional* region, uint32_t flags, time_t qstarttime,
int is_valrec)
{
Expand All @@ -1066,7 +1066,7 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf,
* useful expired record exists. */
struct msgreply_entry* e = msg_cache_lookup(env,
msgqinf->qname, msgqinf->qname_len, msgqinf->qtype,
msgqinf->qclass, flags, 0, 0);
msgqinf->qclass, flags, 0, 1);
if(e) {
struct reply_info* cached = e->entry.data;
if(cached->ttl < *env->now
Expand All @@ -1081,7 +1081,42 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf,
&& cached->security != sec_status_bogus
&& (env->need_to_validate &&
msgrep->security == sec_status_unchecked)
&& !is_valrec) {
/* Exceptions to that rule are:
* o recursions that don't need validation but
* need to update the cache for coherence
* (delegation information while iterating,
* DNSKEY and DS lookups from validator)
* o explicit RRSIG queries that are not
* validated. */
&& !is_valrec
&& msgqinf->qtype != LDNS_RR_TYPE_RRSIG) {
if((int)FLAGS_GET_RCODE(msgrep->flags) !=
LDNS_RCODE_NOERROR &&
(int)FLAGS_GET_RCODE(msgrep->flags) !=
LDNS_RCODE_NXDOMAIN) {
/* The current response has an
* erroneous rcode. Adjust norec time
* so that additional lookups are not
* performed for some time. */
verbose(VERB_ALGO, "set "
"serve-expired-norec-ttl for "
"response in cache");
cached->serve_expired_norec_ttl =
NORR_TTL + *env->now;
if(env->cfg->serve_expired_ttl_reset &&
cached->serve_expired_ttl
< *env->now +
env->cfg->serve_expired_ttl) {
/* Reset serve-expired-ttl for
* valid response in cache. */
verbose(VERB_ALGO, "reset "
"serve-expired-ttl "
"for response in cache");
cached->serve_expired_ttl =
*env->now +
env->cfg->serve_expired_ttl;
}
}
verbose(VERB_ALGO, "a validated expired entry "
"could be overwritten, skip caching "
"the new message at this stage");
Expand Down
6 changes: 4 additions & 2 deletions services/mesh.c
Original file line number Diff line number Diff line change
Expand Up @@ -1512,8 +1512,10 @@ void mesh_query_done(struct mesh_state* mstate)
}
if(mstate->s.return_rcode == LDNS_RCODE_SERVFAIL ||
(rep && FLAGS_GET_RCODE(rep->flags) == LDNS_RCODE_SERVFAIL)) {
/* we are SERVFAILing; check for expired answer here */
mesh_serve_expired_callback(mstate);
if(mstate->s.env->cfg->serve_expired) {
/* we are SERVFAILing; check for expired answer here */
mesh_respond_serve_expired(mstate);
}
if((mstate->reply_list || mstate->cb_list)
&& mstate->s.env->cfg->log_servfail
&& !mstate->s.env->cfg->val_log_squelch) {
Expand Down
48 changes: 36 additions & 12 deletions testdata/serve_expired_client_timeout_servfail.rpl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ server:
ede: yes
ede-serve-expired: yes


stub-zone:
name: "example.com"
stub-addr: 1.2.3.4
Expand All @@ -25,8 +24,10 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and a SERVFAIL upstream re
; - check that we get the expired cached answer
; - query again (the answer is available on the upstream server now)
; - check that we get the immediate expired answer back instead
; - (the upstream query does happen after the expired reply and updates the cache)
; - query again (the upstream has no answer)
; - query again (the answer is available on the upstream server now)
; - check that we *still* get the immediate expired answer back instead, recursion is blocked for NORR_TTL(5)
; - wait for NORR_TTL(5) to expire
; - query again
; - check that we get the freshly cached answer

; ns.example.com.
Expand Down Expand Up @@ -73,7 +74,7 @@ RANGE_BEGIN 30 40
RANGE_END

; ns.example.com.
RANGE_BEGIN 50 60
RANGE_BEGIN 50 100
ADDRESS 1.2.3.4
; response to A query
ENTRY_BEGIN
Expand Down Expand Up @@ -148,12 +149,8 @@ ENTRY_BEGIN
example.com. IN A
ENTRY_END

; Allow for upstream query to resolve.
STEP 51 TRAFFIC

; Check that we got an immediate stale answer because of the previous failure,
; regardless if upstream has the answer already in this range. The query will
; be resolved after the immediate cached answer and will cache the result.
; regardless if upstream has the answer already in this range.
STEP 60 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
Expand All @@ -171,14 +168,41 @@ ENTRY_END
; Query again
STEP 70 QUERY
ENTRY_BEGIN
REPLY RD
REPLY RD DO
SECTION QUESTION
example.com. IN A
ENTRY_END

; Check that we got the cached updated answer from the previous step since
; there is no upstream in this range.
; Check that we still get the immediate stale answer because of the previous failure,
; regardless if upstream has the answer already in this range. NORR_TTL(5) blocks us from
; recursion.
STEP 80 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
REPLY QR RD RA DO NOERROR
SECTION QUESTION
example.com. IN A
SECTION ANSWER
example.com. 123 IN A 5.6.7.8
SECTION AUTHORITY
example.com. 123 IN NS ns.example.com.
SECTION ADDITIONAL
ns.example.com. 123 IN A 1.2.3.4
ENTRY_END

; Let NORR_TTL(5) expire
STEP 81 TIME_PASSES ELAPSE 5

; Query again
STEP 90 QUERY
ENTRY_BEGIN
REPLY RD
SECTION QUESTION
example.com. IN A
ENTRY_END

; Check fresh reply
STEP 100 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl
REPLY QR RD RA NOERROR
Expand Down
18 changes: 10 additions & 8 deletions testdata/serve_expired_client_timeout_val_bogus.rpl
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and bogus answer
; - wait for the record to expire
; - (upstream now has a bogus response)
; - query again for www.example.com. IN A
; - check that we get the expired valid response instead
; - query once more
; - check that we get the expired valid response instead; recursion is blocked for NORR_TTL(5) because of the failure
; - (upstream has the valid response again)
; - query once more
; - check that we get the immediate expired valid response
; - (the prefetch query updates the cache with the valid response)
; - let NORR_TTL(5) expire
; - query one last time
; - check that we get the immediate valid cache response; upstream does not have an answer at this moment
; - check that we get the immediate valid cache response

; The example.com NS and ns.example.com A record are commented out.
; This to make the test succeed. It then keeps the dnssec valid lookup.
Expand Down Expand Up @@ -199,7 +199,7 @@ RANGE_END
;;
;; ns.example.com. with valid data again
;;
RANGE_BEGIN 40 60
RANGE_BEGIN 40 70
ADDRESS 1.2.3.4
; response to query of interest
ENTRY_BEGIN
Expand Down Expand Up @@ -279,7 +279,7 @@ SECTION QUESTION
www.example.com. IN A
ENTRY_END

; immediate cached answer because upstream is valid again
; immediate cached answer; although upstream is valid again
STEP 50 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
Expand All @@ -297,15 +297,17 @@ SECTION ADDITIONAL
;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854}
ENTRY_END

; upstream query is resolved before this query comes in
STEP 51 TIME_PASSES ELAPSE 5

; query one last time
STEP 60 QUERY
ENTRY_BEGIN
REPLY RD DO
SECTION QUESTION
www.example.com. IN A
ENTRY_END

; prefetch query updated the cache, since there is no upstream response in this range
; this is the fresh valid response
STEP 70 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl
Expand Down
22 changes: 10 additions & 12 deletions testdata/serve_expired_ttl_reset.rpl
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ SCENARIO_BEGIN Serve expired ttl with reset on forwarder with a timeout on upstr
; - Wait for it to expire (+ serve-expired-ttl)
; - Send query again
; - Upstream timeouts
; - Error response from iterator SERVFAIL, resets expired-ttl on cache
; - Check we are getting the SERVFAIL response
; - Error response from iterator SERVFAIL, resets expired-ttl on cache and sets norec_ttl blocking recursion
; - Check we are getting the cached response because it was expired-ttl-reset
; - Query again
; - Check we are getting the expired answer
; - Upstream still timeouts
; - Check we are getting the expired answer; prefetching is blocked by norec_ttl
; - If there was prefetching the test would fail with the pending upstream query

STEP 1 QUERY
ENTRY_BEGIN
Expand Down Expand Up @@ -54,7 +54,7 @@ STEP 4 TIME_PASSES ELAPSE 12

STEP 5 QUERY
ENTRY_BEGIN
REPLY RD
REPLY RD DO
SECTION QUESTION
www.example.com. IN A
ENTRY_END
Expand All @@ -67,14 +67,16 @@ STEP 8 TIMEOUT
STEP 9 TIMEOUT
STEP 10 TIMEOUT

; Returns servfail
; Returns
; but error response from iterator resets the expired ttl
STEP 11 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl
REPLY QR RA RD SERVFAIL
MATCH all ttl ede=3
REPLY QR RA RD DO NOERROR
SECTION QUESTION
www.example.com. IN A
SECTION ANSWER
www.example.com. 123 IN A 0.0.0.0
ENTRY_END

; Query again
Expand All @@ -96,8 +98,4 @@ SECTION ANSWER
www.example.com. 123 IN A 0.0.0.0
ENTRY_END

; But the pending query times out!
; Only one because RTT reached the limit.
STEP 16 TIMEOUT

SCENARIO_END
Loading

0 comments on commit fff9f62

Please sign in to comment.