diff --git a/daemon/cachedump.c b/daemon/cachedump.c index 43c1a9a23..85fe9f839 100644 --- a/daemon/cachedump.c +++ b/daemon/cachedump.c @@ -802,8 +802,7 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp) { char buf[257]; struct delegpt_addr* a; - int lame, dlame, rlame, rto, edns_vs, to; - int entry_ttl; + int lame, dlame, rlame, rto, edns_vs, to, delay, entry_ttl; struct rtt_info ri; uint8_t edns_lame_known; for(a = dp->target_list; a; a = a->next_target) { @@ -816,7 +815,7 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp) } /* lookup in infra cache */ entry_ttl = infra_get_host_rto(worker->env.infra_cache, - &a->addr, a->addrlen, &ri, *worker->env.now); + &a->addr, a->addrlen, &ri, &delay, *worker->env.now); if(entry_ttl == -1) { if(!ssl_printf(ssl, "not in infra cache.\n")) return; @@ -840,6 +839,9 @@ print_dp_details(SSL* ssl, struct worker* worker, struct delegpt* dp) rlame?"NoAuthButRecursive ":"", rto, entry_ttl, ri.srtt, ri.rttvar, rtt_notimeout(&ri))) return; + if(delay) + if(!ssl_printf(ssl, ", probedelay %d", delay)) + return; if(infra_host(worker->env.infra_cache, &a->addr, a->addrlen, *worker->env.now, &edns_vs, &edns_lame_known, &to)) { if(edns_vs == -1) { diff --git a/daemon/remote.c b/daemon/remote.c index 66e89650a..f78e1d632 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -1572,10 +1572,11 @@ dump_infra_host(struct lruhash_entry* e, void* arg) return; } if(!ssl_printf(a->ssl, "%s ttl %d ping %d var %d rtt %d rto %d " - "ednsknown %d edns %d\n", + "ednsknown %d edns %d delay %d\n", ip_str, (int)(d->ttl - a->now), d->rtt.srtt, d->rtt.rttvar, rtt_notimeout(&d->rtt), d->rtt.rto, - (int)d->edns_lame_known, (int)d->edns_version)) + (int)d->edns_lame_known, (int)d->edns_version, + (int)(a->now < d->probedelay?d->probedelay-a->now:0))) return; if(d->lameness) lruhash_traverse(d->lameness, 0, &dump_infra_lame, arg); diff --git a/doc/Changelog b/doc/Changelog index 2e4e6d002..a94467cc1 100644 --- a/doc/Changelog 
+++ b/doc/Changelog @@ -1,6 +1,10 @@ 26 October 2010: Wouter - dump_infra and flush_infra commands for unbound-control. - no timeout backoff if meanwhile a query succeeded. + - Change of timeout code. No more lost and backoff in blockage. + At 12sec timeout (and at least 2x lost before) one probe per IP + is allowed only. At 120sec, the IP is blocked. After 15min, a + 120sec entry has a single retry packet. 25 October 2010: Wouter - Configure errors if ldns is not found. diff --git a/services/cache/infra.c b/services/cache/infra.c index 4ac51f69b..9e1e3a81c 100644 --- a/services/cache/infra.c +++ b/services/cache/infra.c @@ -49,6 +49,9 @@ #include "util/config_file.h" #include "iterator/iterator.h" +/** Timeout when only a single probe query per IP is allowed. */ +#define PROBE_MAXRTO 12000 /* in msec */ + size_t infra_host_sizefunc(void* k, void* ATTR_UNUSED(d)) { @@ -213,6 +216,7 @@ host_entry_init(struct infra_cache* infra, struct lruhash_entry* e, rtt_init(&data->rtt); data->edns_version = 0; data->edns_lame_known = 0; + data->probedelay = 0; } /** @@ -257,6 +261,7 @@ infra_host(struct infra_cache* infra, struct sockaddr_storage* addr, struct lruhash_entry* e = infra_lookup_host_nottl(infra, addr, addrlen, 0); struct infra_host_data* data; + int wr = 0; if(e && ((struct infra_host_data*)e->data)->ttl < timenow) { /* it expired, try to reuse existing entry */ lock_rw_unlock(&e->lock); @@ -266,6 +271,7 @@ infra_host(struct infra_cache* infra, struct sockaddr_storage* addr, /* re-initialise */ /* do not touch lameness, it may be valid still */ host_entry_init(infra, e, timenow); + wr = 1; } } if(!e) { @@ -284,6 +290,22 @@ infra_host(struct infra_cache* infra, struct sockaddr_storage* addr, *to = rtt_timeout(&data->rtt); *edns_vs = data->edns_version; *edns_lame_known = data->edns_lame_known; + if(*to >= PROBE_MAXRTO && rtt_notimeout(&data->rtt)*4 <= *to) { + /* delay other queries, this is the probe query */ + if(!wr) { + lock_rw_unlock(&e->lock); + e = 
infra_lookup_host_nottl(infra, addr, addrlen, 1); + if(!e) { /* flushed from cache real fast, no use to + allocate just for the probedelay */ + return 1; + } + data = (struct infra_host_data*)e->data; + } + /* add 999 to round up the timeout value from msec to sec, + * then add a whole second so it is certain that this probe + * has timed out before the next is allowed */ + data->probedelay = timenow + ((*to)+1999)/1000; + } lock_rw_unlock(&e->lock); return 1; } @@ -498,6 +520,7 @@ infra_rtt_update(struct infra_cache* infra, rtt_lost(&data->rtt, orig_rtt); } else { rtt_update(&data->rtt, roundtrip); + data->probedelay = 0; } if(data->rtt.rto > 0) rto = data->rtt.rto; @@ -510,7 +533,7 @@ infra_rtt_update(struct infra_cache* infra, int infra_get_host_rto(struct infra_cache* infra, struct sockaddr_storage* addr, socklen_t addrlen, - struct rtt_info* rtt, uint32_t timenow) + struct rtt_info* rtt, int* delay, uint32_t timenow) { struct lruhash_entry* e = infra_lookup_host_nottl(infra, addr, addrlen, 0); @@ -521,6 +544,9 @@ int infra_get_host_rto(struct infra_cache* infra, if(data->ttl >= timenow) { ttl = (int)(data->ttl - timenow); memmove(rtt, &data->rtt, sizeof(*rtt)); + if(timenow < data->probedelay) + *delay = (int)(data->probedelay - timenow); + else *delay = 0; } lock_rw_unlock(&e->lock); return ttl; @@ -570,6 +596,10 @@ infra_get_lame_rtt(struct infra_cache* infra, return 0; host = (struct infra_host_data*)e->data; *rtt = rtt_unclamped(&host->rtt); + if(host->rtt.rto >= PROBE_MAXRTO && timenow < host->probedelay + && rtt_notimeout(&host->rtt)*4 <= host->rtt.rto) + /* single probe for this domain, and we are not probing */ + *rtt = USEFUL_SERVER_TOP_TIMEOUT; /* check lameness first, if so, ttl on host does not matter anymore */ if(infra_lookup_lame(host, name, namelen, timenow, &dlm, &rlm, &alm, &olm)) { @@ -604,6 +634,13 @@ infra_get_lame_rtt(struct infra_cache* infra, *dnsseclame = 0; *reclame = 0; if(timenow > host->ttl) { + /* expired entry */ + /* see if this 
can be a re-probe of an unresponsive server */ + if(host->rtt.rto >= USEFUL_SERVER_TOP_TIMEOUT) { + *rtt = USEFUL_SERVER_TOP_TIMEOUT-1; + lock_rw_unlock(&e->lock); + return 1; + } lock_rw_unlock(&e->lock); return 0; } diff --git a/services/cache/infra.h b/services/cache/infra.h index 9c203ee4d..376e1ae50 100644 --- a/services/cache/infra.h +++ b/services/cache/infra.h @@ -64,6 +64,8 @@ struct infra_host_key { struct infra_host_data { /** TTL value for this entry. absolute time. */ uint32_t ttl; + /** time in seconds (absolute) when probing re-commences, 0 disabled */ + uint32_t probedelay; /** round trip times for timeout calculation */ struct rtt_info rtt; /** Names of the zones that are lame. NULL=no lame zones. */ @@ -173,6 +175,8 @@ struct infra_host_data* infra_lookup_host(struct infra_cache* infra, * Find host information to send a packet. Creates new entry if not found. * Lameness is empty. EDNS is 0 (try with first), and rtt is returned for * the first message to it. + * Use this to send a packet only, because it also locks out others when + * probing is restricted. * @param infra: infrastructure cache. * @param addr: host address. * @param addrlen: length of addr. @@ -265,6 +269,7 @@ int infra_edns_update(struct infra_cache* infra, /** * Get Lameness information and average RTT if host is in the cache. + * This information is to be used for server selection. * @param infra: infrastructure cache. * @param addr: host address. * @param addrlen: length of addr. @@ -291,12 +296,13 @@ int infra_get_lame_rtt(struct infra_cache* infra, * @param addr: host address. * @param addrlen: length of addr. * @param rtt: the rtt_info is copied into here (caller alloced return struct). + * @param delay: probe delay (if any). * @param timenow: what time it is now. * @return TTL the infra host element is valid for. If -1: not found in cache. 
*/ int infra_get_host_rto(struct infra_cache* infra, struct sockaddr_storage* addr, socklen_t addrlen, - struct rtt_info* rtt, uint32_t timenow); + struct rtt_info* rtt, int* delay, uint32_t timenow); /** * Get memory used by the infra cache.