diff --git a/src/lazyfree.c b/src/lazyfree.c index 645da2b34..80c4607d3 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -39,6 +39,15 @@ void lazyFreeTrackingTable(void *args[]) { atomicIncr(lazyfreed_objects,len); } +/* Release the error stats rax tree. */ +void lazyFreeErrors(void *args[]) { + rax *errors = args[0]; + size_t len = errors->numele; + raxFreeWithCallback(errors, zfree); + atomicDecr(lazyfree_objects,len); + atomicIncr(lazyfreed_objects,len); +} + /* Release the lua_scripts dict. */ void lazyFreeLuaScripts(void *args[]) { dict *lua_scripts = args[0]; @@ -202,6 +211,18 @@ void freeTrackingRadixTreeAsync(rax *tracking) { } } +/* Free the error stats rax tree. + * If the rax tree is huge enough, free it in async way. */ +void freeErrorsRadixTreeAsync(rax *errors) { + /* Because this rax has only keys and no values so we use numnodes. */ + if (errors->numnodes > LAZYFREE_THRESHOLD) { + atomicIncr(lazyfree_objects,errors->numele); + bioCreateLazyFreeJob(lazyFreeErrors,1,errors); + } else { + raxFreeWithCallback(errors, zfree); + } +} + /* Free lua_scripts dict and lru list, if the dict is huge enough, free them in async way. * Close lua interpreter, if there are a lot of lua scripts, close it in async way. */ void freeLuaScriptsAsync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { diff --git a/src/server.c b/src/server.c index 29637dae5..50c27b4bc 100644 --- a/src/server.c +++ b/src/server.c @@ -2605,6 +2605,7 @@ void initServer(void) { server.main_thread_id = pthread_self(); server.current_client = NULL; server.errors = raxNew(); + server.errors_enabled = 1; server.execution_nesting = 0; server.clients = listCreate(); server.clients_index = raxNew(); @@ -3109,8 +3110,9 @@ void resetCommandTableStats(dict* commands) { } void resetErrorTableStats(void) { - raxFreeWithCallback(server.errors, zfree); + freeErrorsRadixTreeAsync(server.errors); server.errors = raxNew(); + server.errors_enabled = 1; } /* ========================== Redis OP Array API ============================ */ @@ -4201,9 +4203,49 @@ int processCommand(client *c) { /* ====================== Error lookup and execution ===================== */ +/* Users who abuse lua error_reply will generate a new error object on each + * error call, which can make server.errors get bigger and bigger. This will + * cause the server to block when calling INFO (we also return errorstats by + * default). To prevent the damage it can cause, when a misuse is detected, + * we will print the warning log and disable the errorstats to avoid adding + * more new errors. It can be re-enabled via CONFIG RESETSTAT. */ +#define ERROR_STATS_NUMBER 128 void incrementErrorCount(const char *fullerr, size_t namelen) { + /* errorstats is disabled, return ASAP. */ + if (!server.errors_enabled) return; + void *result; if (!raxFind(server.errors,(unsigned char*)fullerr,namelen,&result)) { + if (server.errors->numele >= ERROR_STATS_NUMBER) { + sds errors = sdsempty(); + raxIterator ri; + raxStart(&ri, server.errors); + raxSeek(&ri, "^", NULL, 0); + while (raxNext(&ri)) { + char *tmpsafe; + errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len); + errors = sdscatlen(errors, ", ", 2); + if (tmpsafe != NULL) zfree(tmpsafe); + } + sdsrange(errors, 0, -3); /* Remove final ", ". */ + raxStop(&ri); + + /* Print the warning log and the contents of server.errors to the log. */ + serverLog(LL_WARNING, + "Errorstats stopped adding new errors because the number of " + "errors reached the limit, may be misuse of lua error_reply, " + "please check INFO ERRORSTATS, this can be re-enabled via " + "CONFIG RESETSTAT."); + serverLog(LL_WARNING, "Current errors code list: %s", errors); + sdsfree(errors); + + /* Reset the errors and add a single element to indicate that it is disabled. */ + resetErrorTableStats(); + incrementErrorCount("ERRORSTATS_DISABLED", 19); + server.errors_enabled = 0; + return; + } + struct redisError *error = zmalloc(sizeof(*error)); error->count = 1; raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL); diff --git a/src/server.h b/src/server.h index a021837bb..411edadec 100644 --- a/src/server.h +++ b/src/server.h @@ -1563,6 +1563,7 @@ struct redisServer { dict *orig_commands; /* Command table before command renaming. */ aeEventLoop *el; rax *errors; /* Errors table */ + int errors_enabled; /* If true, errorstats is enabled, and we will add new errors. */ unsigned int lruclock; /* Clock for LRU eviction */ volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */ mstime_t shutdown_mstime; /* Timestamp to limit graceful shutdown. */ @@ -2703,6 +2704,7 @@ void trackingHandlePendingKeyInvalidations(void); void trackingInvalidateKeysOnFlush(int async); void freeTrackingRadixTree(rax *rt); void freeTrackingRadixTreeAsync(rax *rt); +void freeErrorsRadixTreeAsync(rax *errors); void trackingLimitUsedSlots(void); uint64_t trackingGetTotalItems(void); uint64_t trackingGetTotalKeys(void); diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index fdd736ee2..6e2d381f5 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -274,6 +274,27 @@ start_server {tags {"info" "external:skip"}} { $rd close } + test {errorstats: limit errors will not increase indefinitely} { + r config resetstat + for {set j 1} {$j <= 1100} {incr j} { + assert_error "$j my error message" { + r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j + } + } + + assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1 + assert_equal [count_log_message 0 "Current errors code list"] 1 + assert_equal "count=1" [errorstat ERRORSTATS_DISABLED] + + # Since we currently have no metrics exposed for server.errors, we use lazyfree + # to verify that we only have 128 errors. + wait_for_condition 50 100 { + [s lazyfreed_objects] eq 128 + } else { + fail "errorstats resetstat lazyfree error" + } + } + test {stats: eventloop metrics} { set info1 [r info stats] set cycle1 [getInfoProperty $info1 eventloop_cycles]