diff --git a/src/Makefile b/src/Makefile index 49e83da1c..f0064d4fe 100644 --- a/src/Makefile +++ b/src/Makefile @@ -354,7 +354,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o 
socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/aof.c b/src/aof.c index 3987bb296..610a5c3f4 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1950,8 +1950,10 @@ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { else return rioWriteBulkLongLong(r, vll); } else if (hi->encoding == OBJ_ENCODING_HT) { - sds value = hashTypeCurrentFromHashTable(hi, what); - return rioWriteBulkString(r, value, sdslen(value)); + char *str; + size_t len; + hashTypeCurrentFromHashTable(hi, what, &str, &len, NULL); + return rioWriteBulkString(r, str, len); } serverPanic("Unknown hash encoding"); @@ -1962,10 +1964,10 @@ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { * The function returns 0 on error, 1 on success. */ int rewriteHashObject(rio *r, robj *key, robj *o) { hashTypeIterator *hi; - long long count = 0, items = hashTypeLength(o); + long long count = 0, items = hashTypeLength(o, 0); hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + while (hashTypeNext(hi, 0) != C_ERR) { if (count == 0) { int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ? 
AOF_REWRITE_ITEMS_PER_CMD : items; diff --git a/src/commands.def b/src/commands.def index 8f1bdf50f..b9416812a 100644 --- a/src/commands.def +++ b/src/commands.def @@ -3303,6 +3303,104 @@ struct COMMAND_ARG HEXISTS_Args[] = { {MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/********** HEXPIRE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HEXPIRE history */ +#define HEXPIRE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HEXPIRE tips */ +#define HEXPIRE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HEXPIRE key specs */ +keySpec HEXPIRE_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HEXPIRE condition argument table */ +struct COMMAND_ARG HEXPIRE_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HEXPIRE argument table */ +struct COMMAND_ARG HEXPIRE_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("seconds",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HEXPIRE_condition_Subargs}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** HEXPIREAT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HEXPIREAT history */ +#define HEXPIREAT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HEXPIREAT tips */ +#define HEXPIREAT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HEXPIREAT key specs */ +keySpec HEXPIREAT_Keyspecs[1] = { 
+{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HEXPIREAT condition argument table */ +struct COMMAND_ARG HEXPIREAT_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HEXPIREAT argument table */ +struct COMMAND_ARG HEXPIREAT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("unix-time-seconds",ARG_TYPE_UNIX_TIME,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HEXPIREAT_condition_Subargs}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** HEXPIRETIME ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HEXPIRETIME history */ +#define HEXPIRETIME_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HEXPIRETIME tips */ +#define HEXPIRETIME_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HEXPIRETIME key specs */ +keySpec HEXPIRETIME_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HEXPIRETIME argument table */ +struct COMMAND_ARG HEXPIRETIME_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + /********** HGET ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -3512,6 +3610,156 @@ struct COMMAND_ARG HMSET_Args[] = { 
{MAKE_ARG("data",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,2,NULL),.subargs=HMSET_data_Subargs}, }; +/********** HPERSIST ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPERSIST history */ +#define HPERSIST_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPERSIST tips */ +#define HPERSIST_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPERSIST key specs */ +keySpec HPERSIST_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPERSIST argument table */ +struct COMMAND_ARG HPERSIST_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** HPEXPIRE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPEXPIRE history */ +#define HPEXPIRE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPEXPIRE tips */ +#define HPEXPIRE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPEXPIRE key specs */ +keySpec HPEXPIRE_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPEXPIRE condition argument table */ +struct COMMAND_ARG HPEXPIRE_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HPEXPIRE argument table */ +struct COMMAND_ARG HPEXPIRE_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("milliseconds",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, 
+{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HPEXPIRE_condition_Subargs}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** HPEXPIREAT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPEXPIREAT history */ +#define HPEXPIREAT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPEXPIREAT tips */ +#define HPEXPIREAT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPEXPIREAT key specs */ +keySpec HPEXPIREAT_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPEXPIREAT condition argument table */ +struct COMMAND_ARG HPEXPIREAT_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HPEXPIREAT argument table */ +struct COMMAND_ARG HPEXPIREAT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("unix-time-milliseconds",ARG_TYPE_UNIX_TIME,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HPEXPIREAT_condition_Subargs}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** HPEXPIRETIME ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPEXPIRETIME history */ +#define HPEXPIRETIME_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPEXPIRETIME tips */ +#define HPEXPIRETIME_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPEXPIRETIME key specs */ +keySpec 
HPEXPIRETIME_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPEXPIRETIME argument table */ +struct COMMAND_ARG HPEXPIRETIME_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/********** HPTTL ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPTTL history */ +#define HPTTL_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPTTL tips */ +#define HPTTL_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPTTL key specs */ +keySpec HPTTL_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPTTL argument table */ +struct COMMAND_ARG HPTTL_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + /********** HRANDFIELD ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -3659,6 +3907,32 @@ struct COMMAND_ARG HSTRLEN_Args[] = { {MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/********** HTTL ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HTTL history */ +#define HTTL_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HTTL tips */ +#define HTTL_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HTTL key specs */ +keySpec HTTL_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HTTL argument table */ +struct COMMAND_ARG HTTL_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, 
+{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + /********** HVALS ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -10710,6 +10984,9 @@ struct COMMAND_STRUCT redisCommandTable[] = { /* hash */ {MAKE_CMD("hdel","Deletes one or more fields and their values from a hash. Deletes the hash if no fields remain.","O(N) where N is the number of fields to be removed.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HDEL_History,1,HDEL_Tips,0,hdelCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HDEL_Keyspecs,1,NULL,2),.args=HDEL_Args}, {MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args}, +{MAKE_CMD("hexpire","Set expiry for hash field using relative time to expire (seconds)","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRE_History,0,HEXPIRE_Tips,0,hexpireCommand,-5,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRE_Keyspecs,1,NULL,5),.args=HEXPIRE_Args}, +{MAKE_CMD("hexpireat","Set expiry for hash field using an absolute Unix timestamp (seconds)","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIREAT_History,0,HEXPIREAT_Tips,0,hexpireatCommand,-5,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HEXPIREAT_Keyspecs,1,NULL,5),.args=HEXPIREAT_Args}, +{MAKE_CMD("hexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in seconds.","O(N) where N is the number of arguments to the 
command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRETIME_History,0,HEXPIRETIME_Tips,0,hexpiretimeCommand,-4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRETIME_Keyspecs,1,NULL,3),.args=HEXPIRETIME_Args}, {MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args}, {MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args}, {MAKE_CMD("hincrby","Increments the integer value of a field in a hash by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBY_History,0,HINCRBY_Tips,0,hincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBY_Keyspecs,1,NULL,3),.args=HINCRBY_Args}, @@ -10718,11 +10995,17 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("hlen","Returns the number of fields in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HLEN_History,0,HLEN_Tips,0,hlenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HLEN_Keyspecs,1,NULL,1),.args=HLEN_Args}, {MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args}, {MAKE_CMD("hmset","Sets the values of multiple fields.","O(N) where N is the number of fields being set.","2.0.0",CMD_DOC_DEPRECATED,"`HSET` with multiple field-value 
pairs","4.0.0","hash",COMMAND_GROUP_HASH,HMSET_History,0,HMSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HMSET_Keyspecs,1,NULL,2),.args=HMSET_Args}, +{MAKE_CMD("hpersist","Removes the expiration time for each specified field","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPERSIST_History,0,HPERSIST_Tips,0,hpersistCommand,-4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPERSIST_Keyspecs,1,NULL,3),.args=HPERSIST_Args}, +{MAKE_CMD("hpexpire","Set expiry for hash field using relative time to expire (milliseconds)","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRE_History,0,HPEXPIRE_Tips,0,hpexpireCommand,-5,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRE_Keyspecs,1,NULL,5),.args=HPEXPIRE_Args}, +{MAKE_CMD("hpexpireat","Set expiry for hash field using an absolute Unix timestamp (milliseconds)","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIREAT_History,0,HPEXPIREAT_Tips,0,hpexpireatCommand,-5,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIREAT_Keyspecs,1,NULL,5),.args=HPEXPIREAT_Args}, +{MAKE_CMD("hpexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in msec.","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRETIME_History,0,HPEXPIRETIME_Tips,0,hpexpiretimeCommand,-4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRETIME_Keyspecs,1,NULL,3),.args=HPEXPIRETIME_Args}, +{MAKE_CMD("hpttl","Returns the TTL in milliseconds of a hash field.","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPTTL_History,0,HPTTL_Tips,0,hpttlCommand,-4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPTTL_Keyspecs,1,NULL,3),.args=HPTTL_Args}, {MAKE_CMD("hrandfield","Returns one or more random 
fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args}, {MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args}, {MAKE_CMD("hset","Creates or modifies the value of a field in a hash.","O(1) for each field/value pair added, so O(N) to add N field/value pairs when the command is called with multiple field/value pairs.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSET_History,1,HSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSET_Keyspecs,1,NULL,2),.args=HSET_Args}, {MAKE_CMD("hsetnx","Sets the value of a field in a hash only when the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSETNX_History,0,HSETNX_Tips,0,hsetnxCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSETNX_Keyspecs,1,NULL,3),.args=HSETNX_Args}, {MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args}, +{MAKE_CMD("httl","Returns the TTL in seconds of a hash field.","O(N) where N is the number of arguments to the command","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HTTL_History,0,HTTL_Tips,0,httlCommand,-4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HTTL_Keyspecs,1,NULL,3),.args=HTTL_Args}, {MAKE_CMD("hvals","Returns all values in a 
hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HVALS_History,0,HVALS_Tips,1,hvalsCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HVALS_Keyspecs,1,NULL,1),.args=HVALS_Args}, /* hyperloglog */ {MAKE_CMD("pfadd","Adds elements to a HyperLogLog key. Creates the key if it doesn't exist.","O(1) to add every element.","2.8.9",CMD_DOC_NONE,NULL,NULL,"hyperloglog",COMMAND_GROUP_HYPERLOGLOG,PFADD_History,0,PFADD_Tips,0,pfaddCommand,-2,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HYPERLOGLOG,PFADD_Keyspecs,1,NULL,2),.args=PFADD_Args}, diff --git a/src/commands/hexpire.json b/src/commands/hexpire.json new file mode 100644 index 000000000..c36c5f80e --- /dev/null +++ b/src/commands/hexpire.json @@ -0,0 +1,118 @@ +{ + "HEXPIRE": { + "summary": "Set expiry for hash field using relative time to expire (seconds)", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -5, + "function": "hexpireCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + 
"name": "seconds", + "type": "integer" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/hexpireat.json b/src/commands/hexpireat.json new file mode 100644 index 000000000..40014fd64 --- /dev/null +++ b/src/commands/hexpireat.json @@ -0,0 +1,118 @@ +{ + "HEXPIREAT": { + "summary": "Set expiry for hash field using an absolute Unix timestamp (seconds)", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -5, + "function": "hexpireatCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "unix-time-seconds", + "type": 
"unix-time" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} \ No newline at end of file diff --git a/src/commands/hexpiretime.json b/src/commands/hexpiretime.json new file mode 100644 index 000000000..a03b4889b --- /dev/null +++ b/src/commands/hexpiretime.json @@ -0,0 +1,83 @@ +{ + "HEXPIRETIME": { + "summary": "Returns the expiration time of a hash field as a Unix timestamp, in seconds.", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -4, + "function": "hexpiretimeCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "Expiration Unix timestamp in seconds.", + "type": "integer", + "minimum": 1 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} 
diff --git a/src/commands/hpersist.json b/src/commands/hpersist.json new file mode 100644 index 000000000..2ba05820c --- /dev/null +++ b/src/commands/hpersist.json @@ -0,0 +1,82 @@ +{ + "HPERSIST": { + "summary": "Removes the expiration time for each specified field", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -4, + "function": "hpersistCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "Expiration time was removed", + "const": 1 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/hpexpire.json b/src/commands/hpexpire.json new file mode 100644 index 000000000..628c8ce16 --- /dev/null +++ b/src/commands/hpexpire.json @@ -0,0 +1,118 @@ +{ + "HPEXPIRE": { + "summary": "Set expiry for hash field using relative time to expire (milliseconds)", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -5, + "function": "hpexpireCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + 
"RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "milliseconds", + "type": "integer" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} \ No newline at end of file diff --git a/src/commands/hpexpireat.json b/src/commands/hpexpireat.json new file mode 100644 index 000000000..63bc03820 --- /dev/null +++ b/src/commands/hpexpireat.json @@ -0,0 +1,118 @@ +{ + "HPEXPIREAT": { + "summary": "Set expiry for hash field using an absolute Unix timestamp (milliseconds)", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -5, + "function": "hpexpireatCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + 
"RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "unix-time-milliseconds", + "type": "unix-time" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} \ No newline at end of file diff --git a/src/commands/hpexpiretime.json b/src/commands/hpexpiretime.json new file mode 100644 index 000000000..6f10229bf --- /dev/null +++ b/src/commands/hpexpiretime.json @@ -0,0 +1,83 @@ +{ + "HPEXPIRETIME": { + "summary": "Returns the expiration time of a hash field as a Unix timestamp, in msec.", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -4, + "function": "hpexpiretimeCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + 
"flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "Expiration Unix timestamp in milliseconds.", + "type": "integer", + "minimum": 1 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/hpttl.json b/src/commands/hpttl.json new file mode 100644 index 000000000..23c8fd1e2 --- /dev/null +++ b/src/commands/hpttl.json @@ -0,0 +1,83 @@ +{ + "HPTTL": { + "summary": "Returns the TTL in milliseconds of a hash field.", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -4, + "function": "hpttlCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated
expire.", + "const": -1 + }, + { + "description": "TTL in milliseconds.", + "type": "integer", + "minimum": 1 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/commands/httl.json b/src/commands/httl.json new file mode 100644 index 000000000..d39483ed3 --- /dev/null +++ b/src/commands/httl.json @@ -0,0 +1,83 @@ +{ + "HTTL": { + "summary": "Returns the TTL in seconds of a hash field.", + "complexity": "O(N) where N is the number of arguments to the command", + "group": "hash", + "since": "8.0.0", + "arity": -4, + "function": "httlCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "oneOf": [ + { + "description": "Key does not exist.", + "type": "null" + }, + { + "description": "Array of results", + "type": "array", + "minItems": 1, + "maxItems": 4294967295, + "items": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "TTL in seconds.", + "type": "integer", + "minimum": 1 + } + ] + } + ] + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } +} diff --git a/src/db.c b/src/db.c index f25960c0f..0ceb54657 100644 --- a/src/db.c +++ b/src/db.c @@ -177,13 +177,13 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { * * If the update_if_existing argument is false, the program is aborted * if the key 
already exists, otherwise, it can fall back to dbOverwrite. */ -static void dbAddInternal(redisDb *db, robj *key, robj *val, int update_if_existing) { +static dictEntry *dbAddInternal(redisDb *db, robj *key, robj *val, int update_if_existing) { dictEntry *existing; int slot = getKeySlot(key->ptr); dictEntry *de = kvstoreDictAddRaw(db->keys, slot, key->ptr, &existing); if (update_if_existing && existing) { dbSetValue(db, key, val, 1, existing); - return; + return existing; } serverAssertWithInfo(NULL, key, de != NULL); kvstoreDictSetKey(db->keys, slot, de, sdsdup(key->ptr)); @@ -191,10 +191,11 @@ static void dbAddInternal(redisDb *db, robj *key, robj *val, int update_if_exist kvstoreDictSetVal(db->keys, slot, de, val); signalKeyAsReady(db, key, val->type); notifyKeyspaceEvent(NOTIFY_NEW,"new",key,db->id); + return de; } -void dbAdd(redisDb *db, robj *key, robj *val) { - dbAddInternal(db, key, val, 0); +dictEntry *dbAdd(redisDb *db, robj *key, robj *val) { + return dbAddInternal(db, key, val, 0); } /* Returns key's hash slot when cluster mode is enabled, or 0 when disabled. @@ -370,6 +371,11 @@ int dbGenericDelete(redisDb *db, robj *key, int async, int flags) { dictEntry *de = kvstoreDictTwoPhaseUnlinkFind(db->keys, slot, key->ptr, &plink, &table); if (de) { robj *val = dictGetVal(de); + + /* If hash object with expiry on fields, remove it from HFE DS of DB */ + if (val->type == OBJ_HASH) + hashTypeRemoveFromExpires(&db->hexpires, val); + /* RM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ incrRefCount(val); @@ -475,6 +481,9 @@ long long emptyDbStructure(redisDb *dbarray, int dbnum, int async, if (async) { emptyDbAsync(&dbarray[j]); } else { + /* Destroy global HFE DS before deleting the hashes since ebuckets + * DS is embedded in the stored objects. 
*/ + ebDestroy(&dbarray[j].hexpires, &hashExpireBucketsType, NULL); kvstoreEmpty(dbarray[j].keys, callback); kvstoreEmpty(dbarray[j].expires, callback); } @@ -554,6 +563,7 @@ redisDb *initTempDb(void) { tempDb[i].id = i; tempDb[i].keys = kvstoreCreate(&dbDictType, slot_count_bits, flags); tempDb[i].expires = kvstoreCreate(&dbExpiresDictType, slot_count_bits, flags); + tempDb[i].hexpires = ebCreate(); } return tempDb; @@ -566,6 +576,9 @@ void discardTempDb(redisDb *tempDb, void(callback)(dict*)) { /* Release temp DBs. */ emptyDbStructure(tempDb, -1, async, callback); for (int i=0; itype == OBJ_HASH) ? hfieldlen : sdslen */ } scanData; /* Helper function to compare key type in scan commands */ @@ -918,7 +932,7 @@ void scanCallback(void *privdata, const dictEntry *de) { list *keys = data->keys; robj *o = data->o; sds val = NULL; - sds key = NULL; + void *key = NULL; /* if OBJ_HASH then key is of type `hfield`. Otherwise, `sds` */ data->sampled++; /* o and typename can not have values at the same time. */ @@ -932,24 +946,29 @@ void scanCallback(void *privdata, const dictEntry *de) { }*/ /* Filter element if it does not match the pattern. 
*/ - sds keysds = dictGetKey(de); + void *keyStr = dictGetKey(de); if (data->pattern) { - if (!stringmatchlen(data->pattern, sdslen(data->pattern), keysds, sdslen(keysds), 0)) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), keyStr, data->strlen(keyStr), 0)) { return; } } if (o == NULL) { - key = keysds; + key = keyStr; } else if (o->type == OBJ_SET) { - key = keysds; + key = keyStr; } else if (o->type == OBJ_HASH) { - key = keysds; + key = keyStr; val = dictGetVal(de); + + /* If field is expired, then ignore */ + if (hfieldIsExpired(key)) + return; + } else if (o->type == OBJ_ZSET) { char buf[MAX_LONG_DOUBLE_CHARS]; int len = ld2string(buf, sizeof(buf), *(double *)dictGetVal(de), LD_STR_AUTO); - key = sdsdup(keysds); + key = sdsdup(keyStr); val = sdsnewlen(buf, len); } else { serverPanic("Type not handled in SCAN callback."); @@ -1023,6 +1042,7 @@ char *getObjectTypeName(robj *o) { * In the case of a Hash object the function returns both the field and value * of every element on the Hash. */ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { + int isKeysHfield = 0; int i, j; listNode *node; long count = 10; @@ -1103,6 +1123,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { ht = o->ptr; } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { + isKeysHfield = 1; ht = o->ptr; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; @@ -1141,7 +1162,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * working on an empty dict, one with a lot of empty buckets, and * for the buckets are not empty, we need to limit the spampled number * to prevent a long hang time caused by filtering too many keys; - * 6. data.no_values: to control whether values will be returned or + * 6. data.no_values: to control whether values will be returned or * only keys are returned. 
*/ scanData data = { .keys = keys, @@ -1150,6 +1171,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { .pattern = use_pattern ? pat : NULL, .sampled = 0, .no_values = no_values, + .strlen = (isKeysHfield) ? hfieldlen : sdslen, }; /* A pattern may restrict all matching keys to one cluster slot. */ @@ -1245,8 +1267,8 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { addReplyArrayLen(c, listLength(keys)); while ((node = listFirst(keys)) != NULL) { - sds key = listNodeValue(node); - addReplyBulkCBuffer(c, key, sdslen(key)); + void *key = listNodeValue(node); + addReplyBulkCBuffer(c, key, (isKeysHfield) ? mstrlen(key) : sdslen(key)); listDelNode(keys, node); } @@ -1339,6 +1361,7 @@ void renameGenericCommand(client *c, int nx) { robj *o; long long expire; int samekey = 0; + uint64_t minHashExpireTime = EB_EXPIRE_TIME_INVALID; /* When source and dest key is the same, no operation is performed, * if the key exists, however we still return an error on unexisting key. */ @@ -1364,9 +1387,21 @@ void renameGenericCommand(client *c, int nx) { * with the same name. */ dbDelete(c->db,c->argv[2]); } - dbAdd(c->db,c->argv[2],o); + dictEntry *de = dbAdd(c->db, c->argv[2], o); if (expire != -1) setExpire(c,c->db,c->argv[2],expire); + + /* If hash with expiration on fields then remove it from global HFE DS and + * keep next expiration time. Otherwise, dbDelete() will remove it from the + * global HFE DS and we will lose the expiration time. 
*/ + if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) + minHashExpireTime = hashTypeRemoveFromExpires(&c->db->hexpires, o); + dbDelete(c->db,c->argv[1]); + + /* If hash with HFEs, register in db->hexpires */ + if (minHashExpireTime != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(c->db, dictGetKey(de), o, minHashExpireTime); + signalModifiedKey(c,c->db,c->argv[1]); signalModifiedKey(c,c->db,c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC,"rename_from", @@ -1390,6 +1425,7 @@ void moveCommand(client *c) { redisDb *src, *dst; int srcid, dbid; long long expire; + uint64_t hashExpireTime = EB_EXPIRE_TIME_INVALID; if (server.cluster_enabled) { addReplyError(c,"MOVE is not allowed in cluster mode"); @@ -1430,12 +1466,25 @@ void moveCommand(client *c) { addReply(c,shared.czero); return; } - dbAdd(dst,c->argv[1],o); + dictEntry *dstDictEntry = dbAdd(dst,c->argv[1],o); if (expire != -1) setExpire(c,dst,c->argv[1],expire); + + /* If hash with expiration on fields, remove it from global HFE DS and keep + * aside registered expiration time. Must be before deletion of the object. + * hexpires (ebuckets) embed in stored items its structure. */ + if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) + hashExpireTime = hashTypeRemoveFromExpires(&src->hexpires, o); + incrRefCount(o); /* OK! key moved, free the entry in the source DB */ dbDelete(src,c->argv[1]); + + /* If object of type hash with expiration on fields. Taken care to add the + * hash to hexpires of `dst` only after dbDelete(). */ + if (hashExpireTime != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(dst, dictGetKey(dstDictEntry), o, hashExpireTime); + signalModifiedKey(c,src,c->argv[1]); signalModifiedKey(c,dst,c->argv[1]); notifyKeyspaceEvent(NOTIFY_GENERIC, @@ -1518,12 +1567,13 @@ void copyCommand(client *c) { /* Duplicate object according to object's type. 
*/ robj *newobj; + uint64_t minHashExpire = EB_EXPIRE_TIME_INVALID; /* HFE feature */ switch(o->type) { case OBJ_STRING: newobj = dupStringObject(o); break; case OBJ_LIST: newobj = listTypeDup(o); break; case OBJ_SET: newobj = setTypeDup(o); break; case OBJ_ZSET: newobj = zsetDup(o); break; - case OBJ_HASH: newobj = hashTypeDup(o); break; + case OBJ_HASH: newobj = hashTypeDup(o, newkey->ptr, &minHashExpire); break; case OBJ_STREAM: newobj = streamDup(o); break; case OBJ_MODULE: newobj = moduleTypeDupOrReply(c, key, newkey, dst->id, o); @@ -1538,8 +1588,15 @@ void copyCommand(client *c) { dbDelete(dst,newkey); } - dbAdd(dst,newkey,newobj); - if (expire != -1) setExpire(c, dst, newkey, expire); + dictEntry *deCopy = dbAdd(dst,newkey,newobj); + + /* if key with expiration then set it */ + if (expire != -1) + setExpire(c, dst, newkey, expire); + + /* If hash with expiration on fields then add it to 'dst' global HFE DS */ + if (minHashExpire != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(dst, dictGetKey(deCopy), newobj, minHashExpire); /* OK! key copied */ signalModifiedKey(c,dst,c->argv[2]); @@ -1629,11 +1686,13 @@ int dbSwapDatabases(int id1, int id2) { * remain in the same DB they were. */ db1->keys = db2->keys; db1->expires = db2->expires; + db1->hexpires = db2->hexpires; db1->avg_ttl = db2->avg_ttl; db1->expires_cursor = db2->expires_cursor; db2->keys = aux.keys; db2->expires = aux.expires; + db2->hexpires = aux.hexpires; db2->avg_ttl = aux.avg_ttl; db2->expires_cursor = aux.expires_cursor; @@ -1864,7 +1923,7 @@ int keyIsExpired(redisDb *db, robj *key) { * EXPIRE_AVOID_DELETE_EXPIRED flag. * * The return value of the function is KEY_VALID if the key is still valid. - * The function returns KEY_EXPIRED if the key is expired BUT not deleted, + * The function returns KEY_EXPIRED if the key is expired BUT not deleted, * or returns KEY_DELETED if the key is expired and deleted. 
*/ keyStatus expireIfNeeded(redisDb *db, robj *key, int flags) { if (server.lazy_expire_disabled) return KEY_VALID; @@ -1878,7 +1937,7 @@ keyStatus expireIfNeeded(redisDb *db, robj *key, int flags) { * replicas. * * Still we try to return the right information to the caller, - * that is, KEY_VALID if we think the key should still be valid, + * that is, KEY_VALID if we think the key should still be valid, * KEY_EXPIRED if we think the key is expired but don't want to delete it at this time. * * When replicating commands from the master, keys are never considered diff --git a/src/debug.c b/src/debug.c index 5c3f5c7a5..6ce1bc71a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -200,7 +200,7 @@ void xorObjectDigest(redisDb *db, robj *keyobj, unsigned char *digest, robj *o) } } else if (o->type == OBJ_HASH) { hashTypeIterator *hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + while (hashTypeNext(hi, 0) != C_ERR) { unsigned char eledigest[20]; sds sdsele; @@ -445,9 +445,9 @@ void debugCommand(client *c) { "SEGFAULT", " Crash the server with sigsegv.", "SET-ACTIVE-EXPIRE <0|1>", -" Setting it to 0 disables expiring keys in background when they are not", -" accessed (otherwise the Redis behavior). Setting it to 1 reenables back the", -" default.", +" Setting it to 0 disables expiring keys (and hash-fields) in background ", +" when they are not accessed (otherwise the Redis behavior). Setting it", +" to 1 reenables back the default.", "QUICKLIST-PACKED-THRESHOLD ", " Sets the threshold for elements to be inserted as plain vs packed nodes", " Default value is 1GB, allows values up to 4GB. 
Setting to 0 restores to default.", @@ -1081,7 +1081,7 @@ void serverLogObjectDebugInfo(const robj *o) { } else if (o->type == OBJ_SET) { serverLog(LL_WARNING,"Set size: %d", (int) setTypeSize(o)); } else if (o->type == OBJ_HASH) { - serverLog(LL_WARNING,"Hash size: %d", (int) hashTypeLength(o)); + serverLog(LL_WARNING,"Hash size: %d", (int) hashTypeLength(o, 0)); } else if (o->type == OBJ_ZSET) { serverLog(LL_WARNING,"Sorted set size: %d", (int) zsetLength(o)); if (o->encoding == OBJ_ENCODING_SKIPLIST) diff --git a/src/dict.c b/src/dict.c index 880042c69..2928d8af5 100644 --- a/src/dict.c +++ b/src/dict.c @@ -67,6 +67,25 @@ static int _dictInit(dict *d, dictType *type); static dictEntry *dictGetNext(const dictEntry *de); static dictEntry **dictGetNextRef(dictEntry *de); static void dictSetNext(dictEntry *de, dictEntry *next); +static int dictDefaultCompare(dict *d, const void *key1, const void *key2); + +/* -------------------------- misc inline functions -------------------------------- */ + +typedef int (*keyCmpFunc)(dict *d, const void *key1, const void *key2); +static inline keyCmpFunc dictGetKeyCmpFunc(dict *d) { + if (d->useStoredKeyApi && d->type->storedKeyCompare) + return d->type->storedKeyCompare; + if (d->type->keyCompare) + return d->type->keyCompare; + return dictDefaultCompare; +} + +static inline uint64_t dictHashKey(dict *d, const void *key, int isStoredKey) { + if (isStoredKey && d->type->storedHashFunction) + return d->type->storedHashFunction(key); + else + return d->type->hashFunction(key); +} /* -------------------------- hash functions -------------------------------- */ @@ -173,6 +192,19 @@ dict *dictCreate(dictType *type) return d; } +/* Change dictType of dict to another one with metadata support + * Rest of dictType's values must stay the same */ +void dictTypeAddMeta(dict **d, dictType *typeWithMeta) { + /* Verify new dictType is compatible with the old one */ + dictType toCmp = *typeWithMeta; + toCmp.dictMetadataBytes = NULL; /* 
Expected old one not to have metadata */ + toCmp.onDictRelease = (*d)->type->onDictRelease; /* Ignore 'onDictRelease' in comparison */ + assert(memcmp((*d)->type, &toCmp, sizeof(dictType)) == 0); /* The rest of the dictType fields must be the same */ + + *d = zrealloc(*d, sizeof(dict) + typeWithMeta->dictMetadataBytes(*d)); + (*d)->type = typeWithMeta; +} + /* Initialize the hash table */ int _dictInit(dict *d, dictType *type) { @@ -182,6 +214,7 @@ int _dictInit(dict *d, dictType *type) d->rehashidx = -1; d->pauserehash = 0; d->pauseAutoResize = 0; + d->useStoredKeyApi = 0; return DICT_OK; } @@ -285,7 +318,7 @@ static void rehashEntriesInBucketAtIndex(dict *d, uint64_t idx) { void *key = dictGetKey(de); /* Get the index in the new hash table */ if (d->ht_size_exp[1] > d->ht_size_exp[0]) { - h = dictHashKey(d, key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]); + h = dictHashKey(d, key, 1) & DICTHT_SIZE_MASK(d->ht_size_exp[1]); } else { /* We're shrinking the table. The tables sizes are powers of * two, so we simply mask the bucket index in the larger table @@ -572,7 +605,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { /* dict is empty */ if (dictSize(d) == 0) return NULL; - h = dictHashKey(d, key); + h = dictHashKey(d, key, d->useStoredKeyApi); idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]); if (dictIsRehashing(d)) { @@ -587,6 +620,8 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { } } + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); + for (table = 0; table <= 1; table++) { if (table == 0 && (long)idx < d->rehashidx) continue; idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); @@ -594,7 +629,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { prevHe = NULL; while(he) { void *he_key = dictGetKey(he); - if (key == he_key || dictCompareKeys(d, key, he_key)) { + if (key == he_key || cmpFunc(d, key, he_key)) { /* Unlink the element from the list */ if (prevHe) dictSetNext(prevHe, 
dictGetNext(he)); @@ -689,6 +724,10 @@ void dictRelease(dict *d) * destroying the dict fake completion. */ if (dictIsRehashing(d) && d->type->rehashingCompleted) d->type->rehashingCompleted(d); + + if (d->type->onDictRelease) + d->type->onDictRelease(d); + _dictClear(d,0,NULL); _dictClear(d,1,NULL); zfree(d); @@ -701,8 +740,9 @@ dictEntry *dictFind(dict *d, const void *key) if (dictSize(d) == 0) return NULL; /* dict is empty */ - h = dictHashKey(d, key); + h = dictHashKey(d, key, d->useStoredKeyApi); idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]); + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); if (dictIsRehashing(d)) { if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { @@ -722,7 +762,7 @@ dictEntry *dictFind(dict *d, const void *key) he = d->ht_table[table][idx]; while(he) { void *he_key = dictGetKey(he); - if (key == he_key || dictCompareKeys(d, key, he_key)) + if (key == he_key || cmpFunc(d, key, he_key)) return he; he = dictGetNext(he); } @@ -759,7 +799,9 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink, if (dictSize(d) == 0) return NULL; /* dict is empty */ if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); + + h = dictHashKey(d, key, d->useStoredKeyApi); + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); for (table = 0; table <= 1; table++) { idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); @@ -767,7 +809,7 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink, dictEntry **ref = &d->ht_table[table][idx]; while (ref && *ref) { void *de_key = dictGetKey(*ref); - if (key == de_key || dictCompareKeys(d, key, de_key)) { + if (key == de_key || cmpFunc(d, key, de_key)) { *table_index = table; *plink = ref; dictPauseRehashing(d); @@ -1530,8 +1572,8 @@ static signed char _dictNextExp(unsigned long size) void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) { unsigned long idx, table; dictEntry *he; + uint64_t hash = dictHashKey(d, key, d->useStoredKeyApi); if 
(existing) *existing = NULL; - uint64_t hash = dictHashKey(d, key); idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]); if (dictIsRehashing(d)) { @@ -1548,6 +1590,8 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) /* Expand the hash table if needed */ _dictExpandIfNeeded(d); + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); + for (table = 0; table <= 1; table++) { if (table == 0 && (long)idx < d->rehashidx) continue; idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); @@ -1555,7 +1599,7 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) he = d->ht_table[table][idx]; while(he) { void *he_key = dictGetKey(he); - if (key == he_key || dictCompareKeys(d, key, he_key)) { + if (key == he_key || cmpFunc(d, key, he_key)) { if (existing) *existing = he; return NULL; } @@ -1587,7 +1631,7 @@ void dictSetResizeEnabled(dictResizeEnable enable) { } uint64_t dictGetHash(dict *d, const void *key) { - return dictHashKey(d, key); + return dictHashKey(d, key, d->useStoredKeyApi); } /* Finds the dictEntry using pointer and pre-calculated hash. @@ -1732,6 +1776,11 @@ void dictGetStats(char *buf, size_t bufsize, dict *d, int full) { orig_buf[orig_bufsize-1] = '\0'; } +static int dictDefaultCompare(dict *d, const void *key1, const void *key2) { + (void)(d); /*unused*/ + return key1 == key2; +} + /* ------------------------------- Benchmark ---------------------------------*/ #ifdef REDIS_TEST diff --git a/src/dict.h b/src/dict.h index 73a8ab052..1c0e6accd 100644 --- a/src/dict.h +++ b/src/dict.h @@ -62,6 +62,32 @@ typedef struct dictType { unsigned int keys_are_odd:1; /* TODO: Add a 'keys_are_even' flag and use a similar optimization if that * flag is set. */ + /* Sometimes we want the ability to store a key in a given way inside the hash + * function, and lookup it in some other way without resorting to any kind of + * conversion. 
For instance the key may be stored as a structure also + * representing other things, but the lookup happens via just a pointer to a + * null terminated string. Optionally providing additional hash/cmp functions, + * dict supports such usage. In that case we'll have a hashFunction() that will + * expect a null terminated C string, and a storedHashFunction() that will + * instead expect the structure. Similarly, the two comparison functions will + * work differently. The keyCompare() will treat the first argument as a pointer + * to a C string and the other as a structure (this way we can directly lookup + * the structure key using the C string), while the storedKeyCompare() will + * check if two pointers to the key in structure form are the same. + * + * However, functions of dict that get a key as argument (void *key) don't get + * any indication whether it is a lookup or stored key. To indicate that + * you intend to use a key of type stored-key and, consequently, the + * dedicated compare and hash functions of stored-key, call + * dictUseStoredKeyApi(d, 1) before using any of the dict functions that get a + * key as a parameter, and then call dictUseStoredKeyApi(d, 0) again once done. + * + * Set both functions to NULL if you don't want to support this feature. */ + uint64_t (*storedHashFunction)(const void *key); + int (*storedKeyCompare)(dict *d, const void *key1, const void *key2); + + /* Optional callback called when the dict is destroyed. */ + void (*onDictRelease)(dict *d); } dictType; #define DICTHT_SIZE(exp) ((exp) == -1 ?
0 : (unsigned long)1<<(exp)) @@ -76,7 +102,9 @@ struct dict { long rehashidx; /* rehashing not in progress if rehashidx == -1 */ /* Keep small vars at end for optimal (minimal) struct padding */ - int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */ + unsigned pauserehash : 15; /* If >0 rehashing is paused */ + + unsigned useStoredKeyApi : 1; /* See comment of storedHashFunction above */ signed char ht_size_exp[2]; /* exponent of size. (size = 1<0 automatic resizing is disallowed (<0 indicates coding error) */ void *metadata[]; @@ -136,7 +164,6 @@ typedef struct { #define dictMetadataSize(d) ((d)->type->dictMetadataBytes \ ? (d)->type->dictMetadataBytes(d) : 0) -#define dictHashKey(d, key) ((d)->type->hashFunction(key)) #define dictBuckets(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1])) #define dictSize(d) ((d)->ht_used[0]+(d)->ht_used[1]) #define dictIsEmpty(d) ((d)->ht_used[0] == 0 && (d)->ht_used[1] == 0) @@ -146,6 +173,7 @@ typedef struct { #define dictIsRehashingPaused(d) ((d)->pauserehash > 0) #define dictPauseAutoResize(d) ((d)->pauseAutoResize++) #define dictResumeAutoResize(d) ((d)->pauseAutoResize--) +#define dictUseStoredKeyApi(d, flag) ((d)->useStoredKeyApi = (flag)) /* If our unsigned long type can store a 64 bit number, use a 64 bit PRNG. */ #if ULONG_MAX >= 0xffffffffffffffff @@ -162,6 +190,7 @@ typedef enum { /* API */ dict *dictCreate(dictType *type); +void dictTypeAddMeta(dict **d, dictType *typeWithMeta); int dictExpand(dict *d, unsigned long size); int dictTryExpand(dict *d, unsigned long size); int dictShrink(dict *d, unsigned long size); diff --git a/src/ebuckets.c b/src/ebuckets.c new file mode 100644 index 000000000..f1450b0ac --- /dev/null +++ b/src/ebuckets.c @@ -0,0 +1,2254 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). 
+ */ + +#include +#include +#include +#include +#include "zmalloc.h" +#include "redisassert.h" +#include "config.h" +#include "ebuckets.h" + +#define UNUSED(x) (void)(x) + + +/*** DEBUGGING & VALIDATION + * + * To validate DS on add(), remove() and ebExpire() + * #define EB_VALIDATE_DEBUG 1 + */ + +/*** BENCHMARK + * + * > make REDIS_CFLAGS='-DREDIS_TEST -DEB_TEST_BENCHMARK' && ./src/redis-server test ebuckets + */ + +/* + * Keep just enough bytes of bucket-key, taking into consideration configured + * EB_BUCKET_KEY_PRECISION, and ignoring LSB bits that has no impact. + * + * The main motivation is that since the bucket-key size determines the maximum + * depth of the rax tree, then we can prune the tree to be more shallow and thus + * reduce the maintenance and traversal of each node in the B-tree. + */ +#if EB_BUCKET_KEY_PRECISION < 8 +#define EB_KEY_SIZE 6 +#elif EB_BUCKET_KEY_PRECISION >= 8 && EB_BUCKET_KEY_PRECISION < 16 +#define EB_KEY_SIZE 5 +#else +#define EB_KEY_SIZE 4 +#endif + +/* + * EB_SEG_MAX_ITEMS - Maximum number of items in rax-segment before trying to + * split. To simplify, it has the same value as EB_LIST_MAX_ITEMS. + */ +#define EB_SEG_MAX_ITEMS 16 +#define EB_LIST_MAX_ITEMS EB_SEG_MAX_ITEMS + +/* From expiration time to bucket-key */ +#define EB_BUCKET_KEY(exptime) ((exptime) >> EB_BUCKET_KEY_PRECISION) + + /* From bucket-key to expiration time */ +#define EB_BUCKET_EXP_TIME(bucketKey) ((uint64_t)(bucketKey) << EB_BUCKET_KEY_PRECISION) + +/*** structs ***/ + +typedef struct CommonSegHdr { + eItem head; +} CommonSegHdr; + + +/* FirstSegHdr - Header of first segment of a bucket. + * + * A bucket in rax tree with a single segment will be as follows: + * + * +-------------+ +------------+ +------------+ + * | FirstSegHdr | | eItem(1) | | eItem(N) | + * [rax] --> | eItem head | --> | void *next | --> ... 
--> | void *next | --+ + * +-------------+ +------------+ +------------+ | + * ^ | + * | | + * +-------------------------------------------------------+ + * + * Note that the cyclic references assist to update locally the segment(s) without + * the need to "heavy" traversal of the rax tree for each change. + */ +typedef struct FirstSegHdr { + eItem head; /* first item in the list */ + uint32_t totalItems; /* total items in the bucket, across chained segments */ + uint32_t numSegs; /* number of segments in the bucket */ +} FirstSegHdr; + +/* NextSegHdr - Header of next segment in an extended-segment (bucket) + * + * Here is the layout of an extended-segment, after adding another item to a single, + * full (EB_SEG_MAX_ITEMS=16), segment (all items must have same bucket-key value): + * + * +-------------+ +------------+ +------------+ +------------+ +------------+ + * | FirstSegHdr | | eItem(17) | | NextSegHdr | | eItem(1) | | eItem(16) | + * [rax] --> | eItem head | --> | void *next | --> | eItem head | --> | void *next | --> ... --> | void *next | --+ + * +-------------+ +------------+ +------------+ +------------+ +------------+ | + * ^ | ^ | + * | | | | + * +------------- firstSeg / prevSeg -+ +------------------------------------------------------+ + */ +typedef struct NextSegHdr { + eItem head; + CommonSegHdr *prevSeg; /* pointer to previous segment */ + FirstSegHdr *firstSeg; /* pointer to first segment of the bucket */ +} NextSegHdr; + +/* Selective copy of ifndef from server.h instead of including it */ +#ifndef static_assert +#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 
1:-1] +#endif +/* Verify that "head" field is aligned in FirstSegHdr, NextSegHdr and CommonSegHdr */ +static_assert(offsetof(FirstSegHdr, head) == 0, "FirstSegHdr head is not aligned"); +static_assert(offsetof(NextSegHdr, head) == 0, "NextSegHdr head is not aligned"); +static_assert(offsetof(CommonSegHdr, head) == 0, "CommonSegHdr head is not aligned"); +/* Verify attached metadata to rax is aligned */ +static_assert(offsetof(rax, metadata) % sizeof(void*) == 0, "metadata field is not aligned in rax"); + +/* EBucketNew - Indicates to the caller to create a new bucket following the addition + * of another item to a bucket (either single-segment or extended-segment). */ +typedef struct EBucketNew { + FirstSegHdr segment; + ExpireMeta *mLast; /* last item in the chain */ + uint64_t ebKey; +} EBucketNew; + +static void ebNewBucket(EbucketsType *type, EBucketNew *newBucket, eItem item, uint64_t key); +static int ebBucketPrint(uint64_t bucketKey, EbucketsType *type, FirstSegHdr *firstSeg); +static uint64_t *ebRaxNumItems(rax *rax); + +/*** Static functions ***/ + +/* Extract pointer to rax from ebuckets handler */ +static inline rax *ebGetRaxPtr(ebuckets eb) { return (rax *)eb; } + +/* The lsb in ebuckets pointer determines whether the pointer points to rax or list. */ +static inline int ebIsList(ebuckets eb) { + return (((uintptr_t)(void *)eb & 0x1) == 1); +} +/* set lsb in ebuckets pointer to 1 to mark it as list.
Unless empty (NULL) */
+static inline ebuckets ebMarkAsList(eItem item) {
+    /* An empty ebuckets stays NULL; it is never tagged */
+    if (item != NULL) {
+        /* Whether 'itemsAddrAreOdd' or not, the result has its lsb set to 1 */
+        uintptr_t tagged = (uintptr_t) item | 1;
+        return (void *) tagged;
+    }
+    return item;
+}
+
+/* Return the head item of a list-type ebuckets handler */
+static inline eItem ebGetListPtr(EbucketsType *type, ebuckets eb) {
+    /* When item addresses are odd by nature, the lsb is part of the address
+     * and must be kept. Otherwise it is only a tag and gets cleared. */
+    if (!type->itemsAddrAreOdd)
+        return (void*)((uintptr_t)(eb) & ~1);
+    return eb;
+}
+
+/* Serialize the logical bucket-key into its rax-key form: EB_KEY_SIZE bytes
+ * stored from the most significant byte down to the least significant one.
+ * The value is the same; only the in-memory layout differs. */
+static inline void bucketKey2RaxKey(uint64_t bucketKey, unsigned char *raxKey) {
+    /* Fill the buffer backwards, peeling off the lowest byte each round */
+    unsigned char *out = raxKey + EB_KEY_SIZE;
+    while (out != raxKey) {
+        *--out = (unsigned char) (bucketKey & 0xFF);
+        bucketKey >>= 8;
+    }
+}
+
+/* Inverse of bucketKey2RaxKey(): rebuild the logical bucket-key (the starting
+ * time value of a bucket) from a rax-key of EB_KEY_SIZE bytes ordered from the
+ * MSB to the LSB. */
+static inline uint64_t raxKey2BucketKey(unsigned char *raxKey) {
+    uint64_t key = 0;
+    const unsigned char *in = raxKey;
+    const unsigned char *end = raxKey + EB_KEY_SIZE;
+    while (in != end)
+        key = (key << 8) | *in++;
+    return key;
+}
+
+/* Add another item to a bucket that consists of extended-segments. In this
+ * scenario, all items in the bucket share the same bucket-key value and the first
+ * segment is already full (if not, the function ebSegAddAvail() would have being
+ * called). This requires the creation of another segment. 
The layout of the
+ * segments before and after the addition of the new item is as follows:
+ *
+ *  Before:  [segHdr] -> {item1,..,item16} -> [..]
+ *  After:   [segHdr] -> {newItem} -> [nextSegHdr] -> {item1,..,item16} -> [..]
+ *
+ * Take care to persist `segHdr` to be the same instance after the change.
+ * This is important because the rax tree is pointing to it.
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): only numItems, next, firstItemBucket and lastInSegment of the
+ * new item are written here; lastItemBucket is presumably already cleared by
+ * the caller - confirm. */
+static int ebSegAddExtended(EbucketsType *type, FirstSegHdr *firstSegHdr, eItem newItem) {
+    /* Allocate nextSegHdr and let it take the items of first segment header */
+    NextSegHdr *nextSegHdr = zmalloc(sizeof(NextSegHdr));
+    nextSegHdr->head = firstSegHdr->head;
+    /* firstSegHdr will stay the first and new nextSegHdr will follow it */
+    nextSegHdr->prevSeg = (CommonSegHdr *) firstSegHdr;
+    nextSegHdr->firstSeg = firstSegHdr;
+
+    /* The old head is no longer the first item of the bucket. Step over
+     * EB_SEG_MAX_ITEMS-1 links to reach the last item of the moved segment. */
+    ExpireMeta *mIter = type->getExpireMeta(nextSegHdr->head);
+    mIter->firstItemBucket = 0;
+    for (int i = 0 ; i < EB_SEG_MAX_ITEMS-1 ; i++)
+        mIter = type->getExpireMeta(mIter->next);
+
+    if (mIter->lastItemBucket) {
+        /* Last item of the bucket closes the cycle on its own segment header */
+        mIter->next = nextSegHdr;
+    } else {
+        /* Update next-next-segment to point back to next-segment */
+        NextSegHdr *nextNextSegHdr = mIter->next;
+        nextNextSegHdr->prevSeg = (CommonSegHdr *) nextSegHdr;
+    }
+
+    firstSegHdr->numSegs += 1;
+    firstSegHdr->totalItems += 1;
+    firstSegHdr->head = newItem;
+
+    /* The new item forms a one-item segment at the front of the bucket */
+    ExpireMeta *mNewItem = type->getExpireMeta(newItem);
+    mNewItem->numItems = 1;
+    mNewItem->next = nextSegHdr;
+    mNewItem->firstItemBucket = 1;
+    mNewItem->lastInSegment = 1;
+
+    return 0;
+}
+
+/* Add another eItem to a segment with available space. 
Keep items sorted in ascending order.
+ *
+ * The segment's item count lives in the numItems field of its head item (all
+ * other items carry 0), so it moves along whenever the head changes.
+ * Always returns 0.
+ *
+ * NOTE(review): for a middle insertion none of the flags of `item` are
+ * written; presumably the caller passes an item with cleared flags - confirm. */
+static int ebSegAddAvail(EbucketsType *type, FirstSegHdr *seg, eItem item) {
+    eItem head = seg->head;
+    ExpireMeta *nextMeta;
+    ExpireMeta *mHead = type->getExpireMeta(head);
+    ExpireMeta *mItem = type->getExpireMeta(item);
+    uint64_t itemExpireTime = ebGetMetaExpTime(mItem);
+
+    seg->totalItems++;
+
+    assert(mHead->numItems < EB_SEG_MAX_ITEMS);
+
+    /* if new item expiry time is smaller than the head then add it before the head */
+    if (ebGetMetaExpTime(mHead) > itemExpireTime) {
+        /* Insert item as the new head. It takes over the head-only fields
+         * (firstItemBucket, numItems) from the previous head. */
+        mItem->next = head;
+        mItem->firstItemBucket = mHead->firstItemBucket;
+        mItem->numItems = mHead->numItems + 1;
+        mHead->firstItemBucket = 0;
+        mHead->numItems = 0;
+        seg->head = item;
+        return 0;
+    }
+
+    /* Insert item in the middle of segment: place it before the first item
+     * with a strictly greater expiration time */
+    ExpireMeta *mIter = mHead;
+    for (int i = 1 ; i < mHead->numItems ; i++) {
+        nextMeta = type->getExpireMeta(mIter->next);
+        /* Insert item in the middle */
+        if (ebGetMetaExpTime(nextMeta) > itemExpireTime) {
+            mHead->numItems = mHead->numItems + 1;
+            mItem->next = mIter->next;
+            mIter->next = item;
+            return 0;
+        }
+        mIter = nextMeta;
+    }
+
+    /* Insert item as the last item of the segment. Inherit flags from previous last item */
+    mHead->numItems = mHead->numItems + 1;
+    mItem->next = mIter->next;
+    mItem->lastInSegment = mIter->lastInSegment;
+    mItem->lastItemBucket = mIter->lastItemBucket;
+    mIter->lastInSegment = 0;
+    mIter->lastItemBucket = 0;
+    mIter->next = item;
+    return 0;
+}
+
+/* Return 1 if split segment to two succeeded. Else, return 0. 
The only reason
+ * the split can fail is that all the items in the segment have the same bucket-key.
+ *
+ * On success `seg` keeps the first part (and its bucket-key), while `newBucket`
+ * describes the second part: its header, its last item (mLast) and the
+ * bucket-key to insert it under (ebKey). The last item of the second part
+ * temporarily points to `newBucket->segment`; the caller must redirect it to
+ * the header it finally allocates. Expects a full segment of EB_SEG_MAX_ITEMS. */
+static int ebTrySegSplit(EbucketsType *type, FirstSegHdr *seg, EBucketNew *newBucket) {
+    const int half = EB_SEG_MAX_ITEMS / 2;
+    int minMidDist = half, bestMiddleIndex = -1;
+    uint64_t splitKey = -1;
+    eItem firstItemSecondPart = NULL;
+    ExpireMeta *mLastItemFirstPart = NULL, *mFirstItemSecondPart = NULL;
+
+    eItem head = seg->head;
+    ExpireMeta *mHead = type->getExpireMeta(head);
+    ExpireMeta *mNext, *mIter = mHead;
+
+    /* Search for best middle index to split the segment into two segments. As the
+     * items are arranged in ascending order, it cannot split between two items that
+     * have the same bucket-key and therefore the split won't necessarily be
+     * balanced (Or won't be possible to split at all if all have the same key!)
+     */
+    for (int i = 0 ; i < EB_SEG_MAX_ITEMS-1 ; i++) {
+        mNext = type->getExpireMeta(mIter->next);
+        if (EB_BUCKET_KEY(ebGetMetaExpTime(mNext)) >
+            EB_BUCKET_KEY(ebGetMetaExpTime(mIter)))
+        {
+            /* Distance of a split after item `i` from a perfectly balanced one */
+            int dist = (i < half) ? (half - i) : (i + 1 - half);
+
+            /* Before halfway, every later candidate is closer to the middle,
+             * so always take it. After crossing the middle, take a candidate
+             * only if it is strictly closer than the best one found so far. */
+            if (i < half || minMidDist > dist) {
+                splitKey = EB_BUCKET_KEY(ebGetMetaExpTime(mNext));
+                bestMiddleIndex = i;
+                mLastItemFirstPart = mIter;
+                mFirstItemSecondPart = mNext;
+                firstItemSecondPart = mIter->next;
+                minMidDist = dist;
+            }
+        }
+        mIter = mNext;
+    }
+
+    /* If cannot find index to split because all with same EB_BUCKET_KEY(), then
+     * segment should be treated as extended segment */
+    if (bestMiddleIndex == -1)
+        return 0;
+
+    /* New bucket. At this point mIter is the last item of the original segment */
+    newBucket->segment.head = firstItemSecondPart;
+    newBucket->segment.numSegs = 1;
+    newBucket->segment.totalItems = EB_SEG_MAX_ITEMS - bestMiddleIndex - 1;
+    mFirstItemSecondPart->numItems = EB_SEG_MAX_ITEMS - bestMiddleIndex - 1;
+    newBucket->mLast = mIter;
+    newBucket->ebKey = splitKey;
+    mIter->lastInSegment = 1;
+    mIter->lastItemBucket = 1;
+    mIter->next = &newBucket->segment; /* to be updated by caller */
+    mFirstItemSecondPart->firstItemBucket = 1;
+
+    /* update existing bucket */
+    seg->totalItems = bestMiddleIndex + 1;
+    mHead->numItems = bestMiddleIndex + 1;
+    mLastItemFirstPart->lastInSegment = 1;
+    mLastItemFirstPart->lastItemBucket = 1;
+    mLastItemFirstPart->next = seg;
+    return 1;
+}
+
+/* Run active expiration on a bucket made of a single segment.
+ *
+ * Items are visited in ascending expiration order until one of: an item that
+ * expires after info->now, info->maxToExpire reached, or the callback
+ * returning ACT_STOP_ACTIVE_EXP. Items for which the callback returns
+ * ACT_UPDATE_EXP_ITEM are pushed onto `updateList`.
+ *
+ * Return 1 if managed to expire the entire segment (header gets freed).
+ * Returns 0 otherwise, with the header updated to the remaining items and
+ * info->nextExpireTime set to the expiration time of the new head. */
+int ebSingleSegExpire(FirstSegHdr *firstSegHdr,
+                      EbucketsType *type,
+                      ExpireInfo *info,
+                      eItem *updateList)
+{
+    uint64_t itemExpTime;
+    eItem iter = firstSegHdr->head;
+    ExpireMeta *mIter = type->getExpireMeta(iter);
+    uint32_t i=0, numItemsInSeg = mIter->numItems;
+
+    while (info->itemsExpired < info->maxToExpire) {
+        itemExpTime = ebGetMetaExpTime(mIter);
+
+        /* Items are arranged in ascending expire-time order in a segment. Stops
+         * active expiration when an item's expire time is greater than `now`. */
+        if (itemExpTime > info->now)
+            break;
+
+        /* keep aside next before deletion of iter */
+        eItem next = mIter->next;
+        mIter->trash = 1;
+        ExpireAction act = info->onExpireItem(iter, info->ctx);
+
+        /* if (act == ACT_REMOVE_EXP_ITEM)
+         *  then don't touch the item. Assume it got deleted */
+
+        /* If indicated to stop then break (cb didn't delete the item) */
+        if (act == ACT_STOP_ACTIVE_EXP) {
+            mIter->trash = 0;
+            break;
+        }
+
+        /* Item stays alive with a new expiration time. Chain it for re-insertion */
+        if (act == ACT_UPDATE_EXP_ITEM) {
+            mIter->next = *updateList;
+            *updateList = iter;
+        }
+
+        ++info->itemsExpired;
+
+        /* if deleted all items in segment, delete header and return */
+        if (++i == numItemsInSeg) {
+            zfree(firstSegHdr);
+            return 1;
+        }
+
+        /* More items in the segment. Set iter to next item and update mIter */
+        iter = next;
+        mIter = type->getExpireMeta(iter);
+    }
+
+    /* Update the single-segment with remaining items */
+    mIter->numItems = numItemsInSeg - i;
+    mIter->firstItemBucket = 1;
+    firstSegHdr->head = iter;
+    firstSegHdr->totalItems -= i;
+
+    /* Update nextExpireTime */
+    info->nextExpireTime = ebGetMetaExpTime(mIter);
+
+    return 0;
+}
+
+/* Run active expiration on a bucket (single or extended segment).
+ *
+ * Return 1 if managed to expire the entire bucket (all headers get freed).
+ * Returns 0 otherwise, leaving `firstSegHdr` as the header of whatever remains
+ * and info->nextExpireTime set to the expiration time of the new head. */
+static int ebSegExpire(FirstSegHdr *firstSegHdr,
+                       EbucketsType *type,
+                       ExpireInfo *info,
+                       eItem *updateList)
+{
+    eItem iter = firstSegHdr->head;
+    uint32_t numSegs = firstSegHdr->numSegs;
+
+    if (numSegs == 1)
+        return ebSingleSegExpire(firstSegHdr, type, info, updateList);
+
+    /*
+     * In an extended-segment, there's no need to verify the expiration time of
+     * each item. This is because all items in an extended-segment share the same
+     * bucket-key. Therefore, we can remove all items without checking their
+     * individual expiration times. This is different from a single-segment
+     * scenario, where items can have different bucket-keys.
+     */
+    for (uint32_t seg=0 ; seg < numSegs ; seg++) {
+        uint32_t i;
+        ExpireMeta *mIter = type->getExpireMeta(iter);
+        uint32_t numItemsInSeg = mIter->numItems;
+
+        for (i = 0; (i < numItemsInSeg) && (info->itemsExpired < info->maxToExpire) ; ++i) {
+            mIter = type->getExpireMeta(iter);
+
+            /* Keep aside `next` before the callback possibly deletes `iter` */
+            eItem next = mIter->next;
+            mIter->trash = 1;
+            ExpireAction act = info->onExpireItem(iter, info->ctx);
+
+            /* if (act == ACT_REMOVE_EXP_ITEM)
+             *  then don't touch the item. Assume it got deleted */
+
+            /* If indicated to stop then break (callback didn't delete the item).
+             * `iter` must keep pointing to that item, since it becomes the new
+             * head of the bucket below. */
+            if (act == ACT_STOP_ACTIVE_EXP) {
+                mIter->trash = 0;
+                break;
+            }
+
+            /* Item stays alive with a new expiration time. Chain it for re-insertion */
+            if (act == ACT_UPDATE_EXP_ITEM) {
+                mIter->next = *updateList;
+                *updateList = iter;
+            }
+
+            ++info->itemsExpired;
+            firstSegHdr->totalItems -= 1;
+
+            /* Item was removed or updated. Advance to the next one */
+            iter = next;
+        }
+
+        /* if deleted all items in segment */
+        if (i == numItemsInSeg) {
+            /* If not last segment in bucket, then delete segment header. Here
+             * `iter` is the `next` of the last item, i.e. the next header. */
+            if (seg + 1 < numSegs) {
+                NextSegHdr *nextSegHdr = iter;
+                iter = nextSegHdr->head;
+                zfree(nextSegHdr);
+                firstSegHdr->numSegs -= 1;
+                firstSegHdr->head = iter;
+                mIter = type->getExpireMeta(iter);
+                mIter->firstItemBucket = 1;
+            }
+        } else {
+            /* We reached here because for-loop above break due to
+             * ACT_STOP_ACTIVE_EXP or reached maxToExpire */
+            firstSegHdr->head = iter;
+            mIter = type->getExpireMeta(iter);
+            mIter->numItems = numItemsInSeg - i;
+            mIter->firstItemBucket = 1;
+            info->nextExpireTime = ebGetMetaExpTime(mIter);
+
+            /* If deleted one or more segments, update prevSeg of next seg to point firstSegHdr.
+             * If it is the last segment, then last item need to point firstSegHdr */
+            if (seg>0) {
+                int remaining = mIter->numItems;
+                for (int j = 0; j < remaining - 1; j++)
+                    mIter = type->getExpireMeta(mIter->next);
+
+                if (mIter->lastItemBucket) {
+                    mIter->next = firstSegHdr;
+                } else {
+                    /* Update next-segment to point back to firstSegHdr */
+                    NextSegHdr *nsh = mIter->next;
+                    nsh->prevSeg = (CommonSegHdr *) firstSegHdr;
+                }
+            }
+
+            return 0;
+        }
+    }
+
+    /* deleted last segment in bucket */
+    zfree(firstSegHdr);
+    return 1;
+}
+
+/*** Static functions of list ***/
+
+/* Convert a list to rax.
+ *
+ * To create a new rax, the function first converts the list to a segment by
+ * allocating a segment header and attaching to it the already existing list.
+ * Then, it adds the new segment to the rax as the first bucket. 
*/
+static rax *ebConvertListToRax(eItem listHead, EbucketsType *type) {
+    /* NOTE(review): both item counters below are set to EB_LIST_MAX_ITEMS
+     * without counting, so this presumably expects a full list - confirm
+     * against the caller. */
+    FirstSegHdr *firstSegHdr = zmalloc(sizeof(FirstSegHdr));
+    firstSegHdr->head = listHead;
+    firstSegHdr->totalItems = EB_LIST_MAX_ITEMS ;
+    firstSegHdr->numSegs = 1;
+
+    /* update last item to point on the segment header */
+    ExpireMeta *metaItem = type->getExpireMeta(listHead);
+    /* The bucket-key is taken from the head item, before walking the list */
+    uint64_t bucketKey = EB_BUCKET_KEY(ebGetMetaExpTime(metaItem));
+    while (metaItem->lastItemBucket == 0)
+        metaItem = type->getExpireMeta(metaItem->next);
+    metaItem->next = firstSegHdr;
+
+    /* Use min expire-time for the first segment in rax */
+    unsigned char raxKey[EB_KEY_SIZE];
+    bucketKey2RaxKey(bucketKey, raxKey);
+    /* The rax metadata area holds a uint64_t total-items counter */
+    rax *rax = raxNewWithMetadata(sizeof(uint64_t));
+    *ebRaxNumItems(rax) = EB_LIST_MAX_ITEMS;
+    raxInsert(rax, raxKey, EB_KEY_SIZE, firstSegHdr, NULL);
+    return rax;
+}
+
+/**
+ * Adds another 'item' to the ebucket of type list, keeping the list sorted by
+ * ascending expiration time.
+ *
+ * @param eb - Pointer to the ebuckets handler of type list. Gets updated if the item is
+ *             added as the new head.
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ * @param item - The eItem to be added to the list.
+ *
+ * @return 1 if the maximum list length is reached; otherwise, return 0. 
+ */
+static int ebAddToList(ebuckets *eb, EbucketsType *type, eItem item) {
+    ExpireMeta *metaItem = type->getExpireMeta(item);
+
+    /* if ebucket-list is empty (NULL), then create a new list by marking 'item'
+     * as the head and tail of the list */
+    if (unlikely(ebIsEmpty(*eb))) {
+        metaItem->next = NULL;
+        metaItem->numItems = 1;
+        metaItem->lastInSegment = 1;
+        metaItem->firstItemBucket = 1;
+        metaItem->lastItemBucket = 1;
+        *eb = ebMarkAsList(item);
+        return 0;
+    }
+
+    eItem head = ebGetListPtr(type, *eb);
+    ExpireMeta *metaHead = type->getExpireMeta(head);
+
+    /* If reached max items in list, then return 1 (the item is NOT added) */
+    if (metaHead->numItems == EB_LIST_MAX_ITEMS)
+        return 1;
+
+    /* if expiry time of 'item' is smaller than the head then add it as the new head */
+    if (ebGetMetaExpTime(metaHead) > ebGetMetaExpTime(metaItem)) {
+        /* Insert item as the new head. The item count is kept only in the
+         * head item, so it moves from the old head to the new one. */
+        metaItem->next = head;
+        metaItem->firstItemBucket = 1;
+        metaItem->numItems = metaHead->numItems + 1;
+        metaHead->firstItemBucket = 0;
+        metaHead->numItems = 0;
+        *eb = ebMarkAsList(item);
+        return 0;
+    }
+
+
+    /* Try insert item in the middle of list: place it before the first item
+     * with a strictly greater expiration time */
+    ExpireMeta *mIter = metaHead;
+    for (int i = 1 ; i < metaHead->numItems ; i++) {
+        ExpireMeta *nextMeta = type->getExpireMeta(mIter->next);
+        /* Insert item in the middle */
+        if (ebGetMetaExpTime(nextMeta) > ebGetMetaExpTime(metaItem)) {
+            metaHead->numItems += 1;
+            metaItem->next = mIter->next;
+            mIter->next = item;
+            return 0;
+        }
+        mIter = nextMeta;
+    }
+
+    /* Insert item as the last item of the list. */
+    metaHead->numItems += 1;
+    metaItem->next = NULL;
+    metaItem->lastInSegment = 1;
+    metaItem->lastItemBucket = 1;
+    /* Update obsolete last item */
+    mIter->lastInSegment = 0;
+    mIter->lastItemBucket = 0;
+    mIter->next = item;
+    return 0;
+}
+
+/* return 1 if removed from list. 
Otherwise, return 0 */
+static int ebRemoveFromList(ebuckets *eb, EbucketsType *type, eItem item) {
+    if (ebIsEmpty(*eb))
+        return 0; /* not removed */
+
+    ExpireMeta *metaItem = type->getExpireMeta(item);
+    eItem head = ebGetListPtr(type, *eb);
+
+    /* if item is the head of the list */
+    if (head == item) {
+        eItem newHead = metaItem->next;
+        if (newHead != NULL) {
+            /* The item count is kept only in the head. Hand it over. */
+            ExpireMeta *mNewHead = type->getExpireMeta(newHead);
+            mNewHead->numItems = metaItem->numItems - 1;
+            mNewHead->firstItemBucket = 1;
+            *eb = ebMarkAsList(newHead);
+            return 1; /* removed */
+        }
+        /* Removed the only item. The ebuckets becomes empty. */
+        *eb = NULL;
+        return 1; /* removed */
+    }
+
+    /* item is not the head of the list */
+    ExpireMeta *metaHead = type->getExpireMeta(head);
+
+    /* Look for the predecessor of `item` and unlink `item` from it */
+    eItem iter = head;
+    while (iter != NULL) {
+        ExpireMeta *metaIter = type->getExpireMeta(iter);
+        if (metaIter->next == item) {
+            metaIter->next = metaItem->next;
+            /* If deleted item is the last in the list, then update new last item */
+            if (metaItem->next == NULL) {
+                metaIter->lastInSegment = 1;
+                metaIter->lastItemBucket = 1;
+            }
+            metaHead->numItems -= 1;
+            return 1; /* removed */
+        }
+        iter = metaIter->next;
+    }
+    return 0; /* not removed */
+}
+
+/* Run active expiration on an ebuckets of type list.
+ *
+ * Items for which the callback returns ACT_UPDATE_EXP_ITEM are pushed onto
+ * `updateList`. Return 1 if none left (then *eb is set to NULL and
+ * info->nextExpireTime to 0). Otherwise return 0, with *eb pointing to the
+ * new head and info->nextExpireTime set to its expiration time. */
+static int ebListExpire(ebuckets *eb,
+                        EbucketsType *type,
+                        ExpireInfo *info,
+                        eItem *updateList)
+{
+    uint32_t expired = 0;
+    eItem item = ebGetListPtr(type, *eb);
+    ExpireMeta *metaItem = type->getExpireMeta(item);
+    uint32_t numItems = metaItem->numItems; /* first item must exists */
+
+    while (item != NULL) {
+        metaItem = type->getExpireMeta(item);
+        uint64_t itemExpTime = ebGetMetaExpTime(metaItem);
+
+        /* Items are arranged in ascending expire-time order in a list. Stops list
+         * active expiration when an item's expiration time is greater than `now`. */
+        if (itemExpTime > info->now)
+            break;
+
+        if (info->itemsExpired == info->maxToExpire)
+            break;
+
+        /* Keep aside `next` before the callback possibly deletes `item` */
+        eItem next = metaItem->next;
+        metaItem->trash = 1;
+        ExpireAction act = info->onExpireItem(item, info->ctx);
+
+        /* if (act == ACT_REMOVE_EXP_ITEM)
+         *  then don't touch the item. Assume it got deleted */
+
+        /* If indicated to stop then break (cb didn't delete the item) */
+        if (act == ACT_STOP_ACTIVE_EXP) {
+            metaItem->trash = 0;
+            break;
+        }
+
+        /* Item stays alive with a new expiration time. Chain it for re-insertion */
+        if (act == ACT_UPDATE_EXP_ITEM) {
+            metaItem->next = *updateList;
+            *updateList = item;
+        }
+
+        ++expired;
+        ++(info->itemsExpired);
+        item = next;
+    }
+
+    if (expired == numItems) {
+        *eb = NULL;
+        info->nextExpireTime = 0;
+        return 1;
+    }
+
+    /* `item` (and its `metaItem`) is the first item left. Make it the head. */
+    metaItem->numItems = numItems - expired;
+    metaItem->firstItemBucket = 1;
+    info->nextExpireTime = ebGetMetaExpTime(metaItem);
+    *eb = ebMarkAsList(item);
+    return 0;
+}
+
+/* Validate the general structure of the list: the head carries the item count
+ * and firstItemBucket, the tail carries lastInSegment/lastItemBucket and a NULL
+ * `next`, and expiration times are in non-decreasing order. */
+static void ebValidateList(eItem head, EbucketsType *type) {
+    if (head == NULL)
+        return;
+
+    ExpireMeta *mHead = type->getExpireMeta(head);
+    eItem iter = head;
+    ExpireMeta *mIter = type->getExpireMeta(iter), *mIterPrev = NULL;
+
+    for (int i = 0; i < mHead->numItems ; ++i) {
+        mIter = type->getExpireMeta(iter);
+        if (i == 0) {
+            /* first item */
+            assert(mIter->numItems > 0 && mIter->numItems <= EB_LIST_MAX_ITEMS);
+            assert(mIter->firstItemBucket == 1);
+        } else {
+            /* Verify that expire time of previous item is smaller or equal */
+            assert(ebGetMetaExpTime(mIterPrev) <= ebGetMetaExpTime(mIter));
+            assert(mIter->numItems == 0);
+            assert(mIter->firstItemBucket == 0);
+        }
+
+        if (i == (mHead->numItems - 1)) {
+            /* last item */
+            assert(mIter->lastInSegment == 1);
+            assert(mIter->lastItemBucket == 1);
+            assert(mIter->next == NULL);
+        } else {
+            assert(mIter->lastInSegment == 0);
+            assert(mIter->lastItemBucket == 0);
+            assert(mIter->next != NULL);
+            mIterPrev = mIter;
+            iter = mIter->next;
+        }
+    }
+}
+
+/*** Static functions of ebuckets / rax ***/
+
+/* Return a pointer to the total-items counter kept in the rax metadata area */
+static uint64_t *ebRaxNumItems(rax *rax) {
+    return (uint64_t*) rax->metadata;
+}
+
+/* Describe, in `newBucket`, a single-segment bucket holding only `item`. No
+ * header is allocated here: the item temporarily points to the header embedded
+ * in `newBucket`. */
+static void ebNewBucket(EbucketsType *type, EBucketNew *newBucket, eItem item, uint64_t key) {
+    ExpireMeta *mItem = type->getExpireMeta(item);
+
+    newBucket->segment.head = item;
+    newBucket->segment.totalItems = 1;
+    newBucket->segment.numSegs = 1;
+    newBucket->mLast = mItem;
+    newBucket->ebKey = key;
+    mItem->numItems = 1;
+    mItem->firstItemBucket = 1;
+    mItem->lastInSegment = 1;
+    mItem->lastItemBucket = 1;
+    mItem->next = &newBucket->segment;
+}
+
+/*
+ * ebBucketPrint - Prints all the segments in the bucket and time expiration
+ * of each item in the following fashion:
+ *
+ * Bucket(key=000000,tot=0008,sgs=0001) : [11, 21, 26, 27, 29, 49, 59, 62]
+ * Bucket(key=000064,tot=0009,sgs=0002) : [182]
+ *                                        [162, 163, 167, 168, 172, 177, 183, 186]
+ *
+ * When the local PRINT_EXPIRE_META_FLAGS switch is set, the first and last
+ * item of each segment are followed by their metadata flags, as
+ * <n=numItems,f=firstItemBucket,ls=lastInSegment,lb=lastItemBucket>.
+ * Always returns 0.
+ */
+static int ebBucketPrint(uint64_t bucketKey, EbucketsType *type, FirstSegHdr *firstSeg) {
+    eItem iter;
+    ExpireMeta *mIter, *mHead;
+    static int PRINT_EXPIRE_META_FLAGS=0;
+
+    iter = firstSeg->head;
+    mHead = type->getExpireMeta(iter);
+
+    printf("Bucket(key=%06" PRIu64 ",tot=%04" PRIu32 ",sgs=%04" PRIu32 ") :",
+           bucketKey, firstSeg->totalItems, firstSeg->numSegs);
+    while (1) {
+        mIter = type->getExpireMeta(iter); /* not really needed. Just to hush the compiler */
+        printf(" [");
+        for (int i = 0; i < mHead->numItems ; ++i) {
+            mIter = type->getExpireMeta(iter);
+            uint64_t expireTime = ebGetMetaExpTime(mIter);
+
+            if (i == 0 && PRINT_EXPIRE_META_FLAGS)
+                printf("%" PRIu64 "<n=%d,f=%d,ls=%d,lb=%d>, ",
+                       expireTime, (int) mIter->numItems, (int) mIter->firstItemBucket,
+                       (int) mIter->lastInSegment, (int) mIter->lastItemBucket);
+            else if (i == (mHead->numItems - 1) && PRINT_EXPIRE_META_FLAGS) {
+                printf("%" PRIu64 "<n=%d,f=%d,ls=%d,lb=%d>",
+                       expireTime, (int) mIter->numItems, (int) mIter->firstItemBucket,
+                       (int) mIter->lastInSegment, (int) mIter->lastItemBucket);
+            } else
+                printf("%" PRIu64 "%s", expireTime, (i == mHead->numItems - 1) ? "" : ", ");
+
+            iter = mIter->next;
+        }
+
+        if (mIter->lastItemBucket) {
+            printf("]\n");
+            break;
+        }
+        printf("]\n                                       ");
+        /* `next` of the last item in a segment is the header of the next segment */
+        iter = ((NextSegHdr *) mIter->next)->head;
+        mHead = type->getExpireMeta(iter);
+
+    }
+    return 0;
+}
+
+/* Add another eItem to bucket. If needed return 'newBucket' for insertion in rax tree.
+ *
+ * 1) If the bucket is based on a single, not full segment, then add the item to the segment.
+ * 2) If a single, full segment, then try to split it and then add the item.
+ * 3) If failed to split, then all items in the bucket have the same bucket-key.
+ *    - If the new item has the same bucket-key, then extend the segment to
+ *      be an extended-segment, if not already, and add the item to it.
+ *    - If the new item has a different bucket-key, then allocate a new bucket
+ *      for it. 
+ *
+ * On return, newBucket->segment.head is NULL if no new bucket is needed. If
+ * the rax key of the visited bucket must change, the new key is stored in
+ * `*updateBucketKey` (left untouched otherwise).
+ */
+static int ebAddToBucket(EbucketsType *type,
+                         FirstSegHdr *firstSegBkt,
+                         eItem item,
+                         EBucketNew *newBucket,
+                         uint64_t *updateBucketKey)
+{
+    newBucket->segment.head = NULL; /* no new bucket as default */
+
+    if (firstSegBkt->numSegs == 1) {
+        /* If bucket is a single, not full segment, then add the item to the segment */
+        if (firstSegBkt->totalItems < EB_SEG_MAX_ITEMS)
+            return ebSegAddAvail(type, firstSegBkt, item);
+
+        /* If bucket is a single, full segment, and segment split succeeded */
+        if (ebTrySegSplit(type, firstSegBkt, newBucket) == 1) {
+            /* The split succeeded, so both halves now have room. (A split
+             * fails only when all items in the segment have the same
+             * bucket-key. That case is handled further below.) */
+            ExpireMeta *mItem = type->getExpireMeta(item);
+
+            /* Check which of the two segments the new item should be added to. Note that
+             * after the split, bucket-key of `newBucket` is bigger than bucket-key of
+             * `firstSegBkt`. That is `firstSegBkt` preserves its bucket-key value
+             * (and its location in rax tree) before the split */
+            if (EB_BUCKET_KEY(ebGetMetaExpTime(type->getExpireMeta(item))) < newBucket->ebKey) {
+                return ebSegAddAvail(type, firstSegBkt, item);
+            } else {
+                /* Add the `item` to the new bucket */
+                ebSegAddAvail(type, &(newBucket->segment), item);
+
+                /* if new item is now last item in the segment, then update mLast
+                 * so the caller redirects the right item to the final header.
+                 * NOTE(review): relies on item's lastItemBucket being 0 unless
+                 * ebSegAddAvail() appended it last - confirm flags are cleared
+                 * by the caller. */
+                if (mItem->lastItemBucket)
+                    newBucket->mLast = mItem;
+                return 0;
+            }
+        }
+    }
+
+    /* If reached here, then either:
+     * (1) a bucket with multiple segments
+     * (2) Or, a single, full segment which failed to split.
+     *
+     * Either way, all items in the bucket have the same bucket-key value. Thus:
+     * (A) If 'item' has the same bucket-key as the ones in this bucket, then add it as well
+     * (B) Else, allocate a new bucket for it.
+     */
+
+    ExpireMeta *mHead = type->getExpireMeta(firstSegBkt->head);
+    ExpireMeta *mItem = type->getExpireMeta(item);
+
+    uint64_t bucketKey = EB_BUCKET_KEY(ebGetMetaExpTime(mHead)); /* same for all items in the segment */
+    uint64_t itemKey = EB_BUCKET_KEY(ebGetMetaExpTime(mItem));
+
+    if (bucketKey == itemKey) {
+        /* New item has the same bucket-key as the ones in this bucket, Add it as well */
+        if (mHead->numItems < EB_SEG_MAX_ITEMS)
+            return ebSegAddAvail(type, firstSegBkt, item); /* Add item to first segment */
+        else {
+            /* If a regular segment becomes extended-segment, then update the
+             * bucket-key to be aligned with the expiration-time of the items
+             * it contains */
+            if (firstSegBkt->numSegs == 1)
+                *updateBucketKey = bucketKey;
+
+            return ebSegAddExtended(type, firstSegBkt, item); /* Add item in a new segment */
+        }
+    } else {
+        /* If the item cannot be added to the visited (extended-segment) bucket
+         * because it has a key not equal to bucket-key, then need to allocate a new
+         * bucket for the item. If the key of the item is below the bucket-key of
+         * the visited bucket, then the new item will be added to a new segment
+         * before it and the visited bucket key will be updated to accurately
+         * reflect the bucket-key of the (extended-segment) bucket */
+        if (bucketKey > itemKey)
+            *updateBucketKey = bucketKey;
+
+        ebNewBucket(type, newBucket, item, EB_BUCKET_KEY(ebGetMetaExpTime(mItem)));
+        return 0;
+    }
+}
+
+/*
+ * Remove item from rax
+ *
+ * Return 1 if removed. Otherwise, return 0
+ *
+ * Note: The function is optimized to remove items locally from segments without
+ *       traversing rax tree or stepping long extended-segments. Therefore, it is
+ *       assumed that the item is present in the bucket without verification.
+ *
+ * TODO: Written straightforward. Should be optimized to merge small segments. 
+ */ +static int ebRemoveFromRax(ebuckets *eb, EbucketsType *type, eItem item) { + ExpireMeta *mItem = type->getExpireMeta(item); + rax *rax = ebGetRaxPtr(*eb); + + /* if item is the only one left in a single-segment bucket, then delete bucket */ + if (unlikely(mItem->firstItemBucket && mItem->lastItemBucket)) { + raxIterator ri; + raxStart(&ri, rax); + unsigned char raxKey[EB_KEY_SIZE]; + bucketKey2RaxKey(EB_BUCKET_KEY(ebGetMetaExpTime(mItem)), raxKey); + raxSeek(&ri, "<=", raxKey, EB_KEY_SIZE); + + if (raxNext(&ri) == 0) + return 0; /* not removed */ + + FirstSegHdr *segHdr = ri.data; + + if (segHdr->head != item) + return 0; /* not removed */ + + zfree(segHdr); + raxRemove(ri.rt, ri.key, EB_KEY_SIZE, NULL); + raxStop(&ri); + + /* If last bucket in rax, then delete the rax */ + if (rax->numele == 0) { + raxFree(rax); + *eb = NULL; + return 1; /* removed */ + } + } else if (mItem->numItems == 1) { + /* If the `item` is the only one in its segment, there must be additional + * items and segments in this bucket. If there weren't, the item would + * have been removed by the previous condition. */ + + if (mItem->firstItemBucket) { + /* If the first item/segment in extended-segments, then + * - Remove current segment (with single item) and promote next-segment to be first. 
+ * - Update first item of next-segment to be firstItemBucket + * - Update `prevSeg` next-of-next segment to point new header of next-segment + * - Update FirstSegHdr to totalItems-1, numSegs-1 */ + NextSegHdr *nextHdr = mItem->next; + FirstSegHdr *firstHdr = (FirstSegHdr *) nextHdr->prevSeg; + firstHdr->head = nextHdr->head; + firstHdr->totalItems--; + firstHdr->numSegs--; + zfree(nextHdr); + eItem *iter = firstHdr->head; + ExpireMeta *mIter = type->getExpireMeta(iter); + mIter->firstItemBucket = 1; + while (mIter->lastInSegment == 0) { + iter = mIter->next; + mIter = type->getExpireMeta(iter); + } + if (mIter->lastItemBucket) + mIter->next = firstHdr; + else + ((NextSegHdr *) mIter->next)->prevSeg = (CommonSegHdr *) firstHdr; + + } else if (mItem->lastItemBucket) { + /* If last item/segment in bucket, then + * - promote previous segment to be last segment + * - Update FirstSegHdr to totalItems-1, numSegs-1 */ + NextSegHdr *currHdr = mItem->next; + CommonSegHdr *prevHdr = currHdr->prevSeg; + eItem iter = prevHdr->head; + ExpireMeta *mIter = type->getExpireMeta(iter); + while (mIter->lastInSegment == 0) { + iter = mIter->next; + mIter = type->getExpireMeta(iter); + } + currHdr->firstSeg->totalItems--; + currHdr->firstSeg->numSegs--; + mIter->next = prevHdr; + mIter->lastItemBucket = 1; + zfree(currHdr); + + } else { + /* item/segment is not the first or last item/segment. + * - Update previous segment to point next segment. 
+ * - Update `prevSeg` of next segment + * - Update FirstSegHdr to totalItems-1, numSegs-1 */ + NextSegHdr *nextHdr = mItem->next; + NextSegHdr *currHdr = (NextSegHdr *) nextHdr->prevSeg; + CommonSegHdr *prevHdr = currHdr->prevSeg; + + ExpireMeta *mIter = type->getExpireMeta(prevHdr->head); + while (mIter->lastInSegment == 0) + mIter = type->getExpireMeta(mIter->next); + + mIter->next = nextHdr; + nextHdr->prevSeg = prevHdr; + nextHdr->firstSeg->totalItems--; + nextHdr->firstSeg->numSegs--; + zfree(currHdr); + + } + } else { + /* At least 2 items in current segment */ + if (mItem->numItems) { + /* If item is first item in segment (Must be numItems>1), then + * - Find segment header and update to point next item. + * - Let next inherit 'item' flags {firstItemBucket, numItems-1} + * - Update FirstSegHdr to totalItems-1 */ + ExpireMeta *mIter = mItem; + CommonSegHdr *currHdr; + while (mIter->lastInSegment == 0) + mIter = type->getExpireMeta(mIter->next); + if (mIter->lastItemBucket) + currHdr = (CommonSegHdr *) mIter->next; + else + currHdr = (CommonSegHdr *) ((NextSegHdr *) mIter->next)->prevSeg; + + if (mItem->firstItemBucket) + ((FirstSegHdr *) currHdr)->totalItems--; + else + ((NextSegHdr *) currHdr)->firstSeg->totalItems--; + + eItem *newHead = mItem->next; + ExpireMeta *mNewHead = type->getExpireMeta(newHead); + mNewHead->firstItemBucket = mItem->firstItemBucket; + mNewHead->numItems = mItem->numItems - 1; + currHdr->head = newHead; + + } else if (mItem->lastInSegment) { + /* If item is last in segment, then + * - find previous item and let it inherit (next, lastInSegment, lastItemBucket) + * - Find and update segment header to numItems-1 + * - Update FirstSegHdr to totalItems-1 */ + CommonSegHdr *currHdr; + if (mItem->lastItemBucket) + currHdr = (CommonSegHdr *) mItem->next; + else + currHdr = (CommonSegHdr *) ((NextSegHdr *) mItem->next)->prevSeg; + + ExpireMeta *mHead = type->getExpireMeta(currHdr->head); + mHead->numItems--; + ExpireMeta *mIter = mHead; + 
while (mIter->next != item)
+                mIter = type->getExpireMeta(mIter->next);
+
+            mIter->next = mItem->next;
+            mIter->lastInSegment = mItem->lastInSegment;
+            mIter->lastItemBucket = mItem->lastItemBucket;
+
+            if (mHead->firstItemBucket)
+                ((FirstSegHdr *) currHdr)->totalItems--;
+            else
+                ((NextSegHdr *) currHdr)->firstSeg->totalItems--;
+
+        } else {
+            /* - Item is in the middle of the segment. Find the previous item and make it point to next.
+             * - Find the segment header and update its head item to numItems-1
+             * - Update FirstSegHdr to totalItems-1 */
+            ExpireMeta *mIter = mItem;
+            CommonSegHdr *currHdr;
+            while (mIter->lastInSegment == 0) /* walk forward to the last item of this segment */
+                mIter = type->getExpireMeta(mIter->next);
+            if (mIter->lastItemBucket)
+                currHdr = (CommonSegHdr *) mIter->next;
+            else
+                currHdr = (CommonSegHdr *) ((NextSegHdr *) mIter->next)->prevSeg;
+
+            ExpireMeta *mHead = type->getExpireMeta(currHdr->head);
+            mHead->numItems--;
+            mIter = mHead;
+            while (mIter->next != item) /* walk from the head to the item preceding 'item' */
+                mIter = type->getExpireMeta(mIter->next);
+
+            mIter->next = mItem->next;
+            mIter->lastInSegment = mItem->lastInSegment;
+            mIter->lastItemBucket = mItem->lastItemBucket;
+
+            if (mHead->firstItemBucket)
+                ((FirstSegHdr *) currHdr)->totalItems--;
+            else
+                ((NextSegHdr *) currHdr)->firstSeg->totalItems--;
+        }
+    }
+    *ebRaxNumItems(rax) -= 1;
+    return 1; /* removed */
+}
+/* Add 'item' to a rax-backed ebuckets under the bucket-key 'bucketKeyItem'.
+ *
+ * The rax is seeked with the "<=" operator on the item's key. If no bucket is
+ * found, a new single-item bucket is allocated and inserted. Otherwise the item
+ * is handed to ebAddToBucket() for the found bucket, after which the bucket may
+ * be re-keyed in the rax and/or an additional new bucket may be inserted.
+ *
+ * The rax-wide item counter is incremented before the insertion takes place.
+ * Every path visible in this function returns 0. */
+int ebAddToRax(ebuckets *eb, EbucketsType *type, eItem item, uint64_t bucketKeyItem) {
+    EBucketNew newBucket; /* ebAddToBucket takes care to update newBucket.segment.head
+                           * NOTE(review): not initialized here; presumably always written by
+                           * ebAddToBucket() before it is read below - confirm */
+    raxIterator iter;
+    unsigned char raxKey[EB_KEY_SIZE];
+    bucketKey2RaxKey(bucketKeyItem, raxKey);
+    rax *rax = ebGetRaxPtr(*eb);
+    raxStart(&iter,rax);
+    raxSeek(&iter, "<=", raxKey, EB_KEY_SIZE);
+    *ebRaxNumItems(rax) += 1;
+    /* If expireTime of the item is below the bucket-key of the first bucket in rax,
+     * then we need to add it as a new bucket at the beginning of the rax. */
+    if(raxNext(&iter) == 0) {
+        FirstSegHdr *firstSegHdr = zmalloc(sizeof(FirstSegHdr));
+        firstSegHdr->head = item;
+        firstSegHdr->totalItems = 1;
+        firstSegHdr->numSegs = 1;
+
+        /* The item is the only one in the bucket: it is first and last at once,
+         * and as the last item its 'next' points to the segment header */
+        ExpireMeta *metaItem = type->getExpireMeta(item);
+        metaItem->lastItemBucket = 1;
+        metaItem->lastInSegment = 1;
+        metaItem->firstItemBucket = 1;
+        metaItem->numItems = 1;
+        metaItem->next = firstSegHdr;
+        bucketKey2RaxKey(bucketKeyItem, raxKey); /* same conversion as at function entry */
+        raxInsert(rax, raxKey, EB_KEY_SIZE, firstSegHdr, NULL);
+        raxStop(&iter);
+        return 0;
+    }
+
+    /* Add the new item into the first segment of the bucket that we found */
+    uint64_t updateBucketKey = 0;
+    ebAddToBucket(type, iter.data, item, &newBucket, &updateBucketKey);
+
+    /* If, following the addition, the key of the found bucket must change to
+     * `updateBucketKey`, re-insert the same bucket under the new key */
+    if(unlikely(updateBucketKey && updateBucketKey != raxKey2BucketKey(iter.key))) {
+        raxRemove(iter.rt, iter.key, EB_KEY_SIZE, NULL);
+        bucketKey2RaxKey(updateBucketKey, raxKey);
+        raxInsert(iter.rt, raxKey, EB_KEY_SIZE, iter.data, NULL);
+    }
+
+    /* If ebAddToBucket() returned a new bucket, then add the bucket to rax.
+     *
+     * This might happen when trying to add another item to a bucket that is:
+     * 1. A single, full segment. Will result in a bucket (segment) split.
+     * 2. Extended segment with a different bucket-key than the new item.
+     *    Will result in a new bucket (of size 1) for the new item.
+     */
+    if (newBucket.segment.head != NULL) {
+        /* Allocate segment header for the new bucket */
+        FirstSegHdr *newSeg = zmalloc(sizeof(FirstSegHdr));
+        /* Move the segment from 'newBucket' to the allocated segment header */
+        *newSeg = newBucket.segment;
+        /* Update 'next' of the last item in the segment to point to the new `FirstSegHdr` */
+        newBucket.mLast->next = newSeg;
+        /* Insert the new bucket to rax */
+        bucketKey2RaxKey(newBucket.ebKey, raxKey);
+        raxInsert(iter.rt, raxKey, EB_KEY_SIZE, newSeg, NULL);
+    }
+
+    raxStop(&iter);
+    return 0;
+}
+
+/* Validate the general structure of the buckets in rax.
+ *
+ * Walks every bucket of the rax and asserts the per-item flags, the item count
+ * of each segment, the links between segment headers, the per-bucket
+ * `totalItems` / `numSegs`, and finally that all buckets together hold exactly
+ * the number of items recorded in the rax-wide counter. */
+static void ebValidateRax(rax *rax, EbucketsType *type) {
+    uint64_t numItemsTotal = 0;
+    raxIterator raxIter;
+    raxStart(&raxIter, rax);
+    raxSeek(&raxIter, "^", NULL, 0);
+    while (raxNext(&raxIter)) {
+        FirstSegHdr *firstSegHdr = raxIter.data;
+        eItem iter;
+        ExpireMeta *mIter, *mHead;
+        iter = firstSegHdr->head;
+        mHead = type->getExpireMeta(iter);
+        uint64_t numItemsBucket = 0, countSegments = 0;
+
+        int extendedSeg = (firstSegHdr->numSegs > 1) ? 
1 : 0;
+        void *segHdr = firstSegHdr;
+
+        mIter = type->getExpireMeta(iter);
+        assert(mIter->firstItemBucket == 1);
+        while (1) {
+            uint64_t curBktKey, prevBktKey;
+            for (int i = 0; i < mHead->numItems ; ++i) {
+                assert(iter != NULL);
+                mIter = type->getExpireMeta(iter);
+                curBktKey = EB_BUCKET_KEY(ebGetMetaExpTime(mIter));
+
+                if (i == 0) {
+                    assert(mIter->numItems > 0 && mIter->numItems <= EB_SEG_MAX_ITEMS);
+                    prevBktKey = curBktKey;
+                } else {
+                    assert( (extendedSeg && prevBktKey == curBktKey) ||
+                            (!extendedSeg && prevBktKey <= curBktKey) );
+                    assert(mIter->numItems == 0);
+                    assert(mIter->firstItemBucket == 0);
+                    prevBktKey = curBktKey;
+                }
+
+                if (i == mHead->numItems - 1)
+                    assert(mIter->lastInSegment == 1);
+                else
+                    assert(mIter->lastInSegment == 0);
+
+                iter = mIter->next;
+            }
+
+            numItemsBucket += mHead->numItems;
+            countSegments += 1;
+
+            if (mIter->lastItemBucket)
+                break;
+
+            NextSegHdr *nextSegHdr = mIter->next;
+            assert(nextSegHdr->firstSeg == firstSegHdr);
+            assert(nextSegHdr->prevSeg == segHdr);
+            iter = nextSegHdr->head;
+            mHead = type->getExpireMeta(iter);
+            segHdr = nextSegHdr;
+        }
+        /* Verify next of last item, `totalItems` and `numSegs` in iterated bucket */
+        assert(mIter->next == segHdr);
+        assert(numItemsBucket == firstSegHdr->totalItems);
+        assert(countSegments == firstSegHdr->numSegs);
+        numItemsTotal += numItemsBucket;
+    }
+    raxStop(&raxIter);
+    assert(numItemsTotal == *ebRaxNumItems(rax));
+}
+
+/* Context handed to ebRaxDeleteCb() through raxFreeWithCbAndContext():
+ * the ebuckets type plus the caller's opaque context for onDeleteItem. */
+struct deleteCbCtx { EbucketsType *type; void *userCtx; };
+
+/* rax free-callback: releases one bucket (all of its segment headers) and
+ * marks every item in it as trash, invoking the optional onDeleteItem callback.
+ *
+ * @param item    - The bucket's FirstSegHdr as stored in the rax.
+ * @param context - Pointer to a `struct deleteCbCtx`.
+ *
+ * Each segment header is freed before its items are visited. This is safe
+ * because the fields needed from a header (head, numSegs) are read beforehand
+ * and the items are walked through their own `next` pointers. */
+void ebRaxDeleteCb(void *item, void *context) {
+    struct deleteCbCtx *ctx = context;
+    FirstSegHdr *firstSegHdr = item;
+    eItem itemIter = firstSegHdr->head;
+    uint32_t numSegs = firstSegHdr->numSegs;
+    void *nextSegHdr = firstSegHdr;
+
+    for (uint32_t seg=0 ; seg < numSegs ; seg++) {
+        zfree(nextSegHdr);
+
+        /* The head item of a segment carries the number of items in it */
+        ExpireMeta *mIter = ctx->type->getExpireMeta(itemIter);
+        uint32_t numItemsInSeg = mIter->numItems;
+
+        for (uint32_t i = 0; i < numItemsInSeg ; ++i) {
+            mIter = ctx->type->getExpireMeta(itemIter);
+            eItem toDelete = itemIter;
+            mIter->trash = 1;
+            /* Fetch 'next' before the callback, which may release the item */
+            itemIter = mIter->next;
+            /* Hand over the user's context itself (not its address), the same
+             * value that ebDestroy() passes on the list path */
+            if (ctx->type->onDeleteItem) ctx->type->onDeleteItem(toDelete, ctx->userCtx);
+        }
+        /* 'next' of the last item in a segment is a segment header */
+        nextSegHdr = itemIter;
+
+        if (seg + 1 < numSegs)
+            itemIter = ((NextSegHdr *) nextSegHdr)->head;
+    }
+
+}
+
+/* Print statistics of the ebuckets and, if 'printItems' is non-zero, the items
+ * of every bucket. If 'usedMem' differs from -1, memory statistics are printed
+ * as well. For a list only the items are printed (no statistics). */
+static void _ebPrint(ebuckets eb, EbucketsType *type, int64_t usedMem, int printItems) {
+    if (ebIsEmpty(eb)) {
+        printf("Empty ebuckets\n");
+        return;
+    }
+
+    if (ebIsList(eb)) {
+        /* mock rax segment */
+        eItem head = ebGetListPtr(type, eb);
+        ExpireMeta *metaHead = type->getExpireMeta(head);
+        FirstSegHdr mockSeg = { head, metaHead->numItems, 1};
+        if (printItems)
+            ebBucketPrint(0, type, &mockSeg);
+        return;
+    }
+
+    uint64_t totalItems = 0;
+    uint64_t numBuckets = 0;
+    uint64_t numSegments = 0;
+
+    rax *rax = ebGetRaxPtr(eb);
+    raxIterator iter;
+    raxStart(&iter, rax);
+    raxSeek(&iter, "^", NULL, 0);
+    while (raxNext(&iter)) {
+        FirstSegHdr *seg = iter.data;
+        if (printItems)
+            ebBucketPrint(raxKey2BucketKey(iter.key), type, seg);
+        totalItems += seg->totalItems;
+        numBuckets++;
+        numSegments += seg->numSegs;
+    }
+
+    printf("Total number of items : %" PRIu64 "\n", totalItems);
+    printf("Total number of buckets : %" PRIu64 "\n", numBuckets);
+    printf("Total number of segments : %" PRIu64 "\n", numSegments);
+    printf("Average items per bucket : %.2f\n",
+           (double) totalItems / numBuckets);
+    printf("Average items per segment : %.2f\n",
+           (double) totalItems / numSegments);
+    printf("Average segments per bucket : %.2f\n",
+           (double) numSegments / numBuckets);
+
+    if (usedMem != -1)
+    {
+        printf("\nEbuckets memory usage (including FirstSegHdr/NexSegHdr):\n");
+        printf("Total : %.2f KBytes\n",
+               (double) usedMem / 1024);
+        printf("Average per bucket : %" PRIu64 " Bytes\n",
+               usedMem / numBuckets);
+        printf("Average per item : %" PRIu64 " Bytes\n",
+               usedMem / totalItems);
+        printf("EB_BUCKET_KEY_PRECISION : %d\n",
+               EB_BUCKET_KEY_PRECISION);
+
printf("EB_SEG_MAX_ITEMS : %d\n", + EB_SEG_MAX_ITEMS); + } + raxStop(&iter); +} + +/*** API functions ***/ + +/** + * Deletes all items from given ebucket, invoking optional item deletion callbacks. + * + * @param eb - The ebucket to be deleted. + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * @param ctx - A context pointer that can be used in optional item deletion callbacks. + */ +void ebDestroy(ebuckets *eb, EbucketsType *type, void *ctx) { + if (ebIsEmpty(*eb)) + return; + + if (ebIsList(*eb)) { + eItem head = ebGetListPtr(type, *eb); + eItem *pItemNext = &head; + while ( (*pItemNext) != NULL) { + eItem toDelete = *pItemNext; + ExpireMeta *metaToDelete = type->getExpireMeta(toDelete); + *pItemNext = metaToDelete->next; + metaToDelete->trash = 1; + if (type->onDeleteItem) type->onDeleteItem(toDelete, ctx); + } + } else { + struct deleteCbCtx deleteCtx = {type, ctx}; + raxFreeWithCbAndContext(ebGetRaxPtr(*eb), ebRaxDeleteCb, &deleteCtx); + } + + *eb = NULL; +} + +/** + * Removes the specified item from the given ebucket, updating the ebuckets handler + * accordingly. The function is optimized to remove items locally from segments + * without traversing rax tree or stepping long extended-segments. Therefore, + * it is assumed that the item is present in the bucket without verification. + * + * @param eb - Pointer to the ebuckets handler, which may get updated if the removal + * affects the structure. + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * @param item - The eItem to be removed from the ebucket. + * + * @return 1 if the item was successfully removed; otherwise, return 0. 
+ */
+int ebRemove(ebuckets *eb, EbucketsType *type, eItem item) {
+
+    if (ebIsEmpty(*eb))
+        return 0; /* not removed */
+
+    int res;
+    if (ebIsList(*eb))
+        res = ebRemoveFromList(eb, type, item);
+    else /* rax */
+        res = ebRemoveFromRax(eb, type, item);
+
+    /* if removed then mark as trash */
+    if (res)
+        type->getExpireMeta(item)->trash = 1;
+
+#if (REDIS_TEST || EB_VALIDATE_DEBUG) && !defined(EB_TEST_BENCHMARK)
+    ebValidate(*eb, type);
+#endif
+    return res;
+}
+
+/**
+ * Adds the specified item to the ebucket structure based on expiration time.
+ * If the ebucket is a list or empty, it attempts to add the item to the list.
+ * Otherwise, it adds the item to rax. If the list reaches its maximum size, it
+ * is converted to rax. The ebuckets handler may be updated accordingly.
+ *
+ * The expiration time is stored in the item's ExpireMeta and all of its segment
+ * flags (including `trash`) are reset before the insertion.
+ *
+ * @param eb - Pointer to the ebuckets handler, which may get updated
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ * @param item - The eItem to be added to the ebucket.
+ * @param expireTime - The expiration time of the item. Must not exceed
+ *                     EB_EXPIRE_TIME_MAX (asserted).
+ *
+ * @return 0 if the item was added. The value is the one returned by
+ *         ebAddToList() or ebAddToRax(): ebAddToRax() returns 0, and a return
+ *         of 1 from ebAddToList() means the list is full, in which case the
+ *         list is converted to rax and the result of ebAddToRax() is returned.
+ *         NOTE(review): other return values of ebAddToList() are propagated
+ *         as-is - confirm against its definition.
+ */
+int ebAdd(ebuckets *eb, EbucketsType *type, eItem item, uint64_t expireTime) {
+    int res;
+
+    assert(expireTime <= EB_EXPIRE_TIME_MAX);
+
+    /* Set expire-time and reset segment flags */
+    ExpireMeta *itemMeta = type->getExpireMeta(item);
+    ebSetMetaExpTime(itemMeta, expireTime);
+    itemMeta->lastInSegment = 0;
+    itemMeta->firstItemBucket = 0;
+    itemMeta->lastItemBucket = 0;
+    itemMeta->numItems = 0;
+    itemMeta->trash = 0;
+
+    if (ebIsList(*eb) || (ebIsEmpty(*eb))) {
+        /* Try to add the item to the list */
+        if ( (res = ebAddToList(eb, type, item)) == 1) {
+            /* Failed to add since the list reached its maximum size. Convert to rax */
+            *eb = ebConvertListToRax(ebGetListPtr(type, *eb), type);
+            res = ebAddToRax(eb, type, item, EB_BUCKET_KEY(expireTime));
+        }
+    } else {
+        /* Add item to rax */
+        res = ebAddToRax(eb, type, item, EB_BUCKET_KEY(expireTime));
+    }
+#if (REDIS_TEST || EB_VALIDATE_DEBUG) && !defined(EB_TEST_BENCHMARK)
+    ebValidate(*eb, type);
+#endif
+    return res;
+}
+
+/**
+ * Performs expiration on the given ebucket, removing items that have expired.
+ *
+ * If all items in the data structure are expired, 'eb' will be set to NULL.
+ *
+ * On entry `info->nextExpireTime` and `info->itemsExpired` are reset to 0. On
+ * the rax path, `info->nextExpireTime` is set to the expiration time of the
+ * head item of the first bucket whose key is not below EB_BUCKET_KEY(info->now).
+ * Items collected in the internal update list are re-added with ebAdd() before
+ * returning.
+ *
+ * @param eb - Pointer to the ebuckets handler, which may get updated
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ * @param info - Providing information about the expiration action.
+ */
+void ebExpire(ebuckets *eb, EbucketsType *type, ExpireInfo *info) {
+    /* updateList - maintain a list of expired items that the callback `onExpireItem`
+     * indicated to update their expiration time rather than removing them.
+     * At the end of this function, `updateList` will be `ebAdd()` back. */
+    eItem updateList = NULL;
+
+    /* reset info outputs */
+    info->nextExpireTime = 0;
+    info->itemsExpired = 0;
+
+    /* if empty ebuckets */
+    if (ebIsEmpty(*eb)) return;
+
+    if (ebIsList(*eb)) {
+        ebListExpire(eb, type, info, &updateList);
+        goto END_ACTEXP;
+    }
+
+    /* handle rax expiry */
+
+    rax *rax = ebGetRaxPtr(*eb);
+    raxIterator iter;
+
+    raxStart(&iter, rax);
+
+    uint64_t nowKey = EB_BUCKET_KEY(info->now);
+    uint64_t itemsExpiredBefore = info->itemsExpired;
+
+    while (1) {
+        raxSeek(&iter,"^",NULL,0); /* always restart from the first bucket left in the rax */
+        if (!raxNext(&iter)) break;
+
+        uint64_t bucketKey = raxKey2BucketKey(iter.key);
+
+        FirstSegHdr *firstSegHdr = iter.data;
+
+        /* We need to take into consideration EB_BUCKET_KEY_PRECISION. The value of
+         * "info->now" is converted to a bucket-key and only buckets whose key is
+         * strictly below EB_BUCKET_KEY(info->now) are processed. The first bucket
+         * at or above that key stops the scan. */
+        if (bucketKey >= nowKey) {
+            /* Take care to update next expire time based on next segment to expire */
+            info->nextExpireTime = ebGetMetaExpTime(
+                    type->getExpireMeta(firstSegHdr->head));
+            break;
+        }
+
+        /* If we did not manage to remove the entire bucket then stop scanning */
+        if (ebSegExpire(firstSegHdr, type, info, &updateList) == 0)
+            break;
+
+        raxRemove(iter.rt, iter.key, EB_KEY_SIZE, NULL);
+    }
+
+    raxStop(&iter);
+    *ebRaxNumItems(rax) -= info->itemsExpired - itemsExpiredBefore;
+
+    /* NOTE(review): the iterator is queried with raxEOF() after raxStop() - confirm
+     * that raxStop() leaves the EOF state readable */
+    if(raxEOF(&iter) && (updateList == 0)) {
+        raxFree(rax);
+        *eb = NULL;
+    }
+
+END_ACTEXP:
+    /* Add back items with updated expiration time */
+    while (updateList) {
+        ExpireMeta *mItem = type->getExpireMeta(updateList);
+        eItem next = mItem->next; /* read before ebAdd() relinks the item */
+        ebAdd(eb, type, updateList, ebGetMetaExpTime(mItem));
+        updateList = next;
+    }
+#if (REDIS_TEST || EB_VALIDATE_DEBUG) && !defined(EB_TEST_BENCHMARK)
+    ebValidate(*eb, type);
+#endif
+    return;
+}
+
+/* Performs active expiration dry-run to evaluate number of expired items
+ *
+ * It is faster than actual active-expire because it iterates only over the
+ * headers of the buckets until the first non-expired bucket, and no more than
+ * EB_SEG_MAX_ITEMS items in the last bucket
+ *
+ * @param eb - The ebucket to be checked.
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ * @param now - The current time in milliseconds. 
+ */ +uint64_t ebExpireDryRun(ebuckets eb, EbucketsType *type, uint64_t now) { + if (ebIsEmpty(eb)) return 0; + + uint64_t numExpired = 0; + + /* If list, then iterate and count expired ones */ + if (ebIsList(eb)) { + ExpireMeta *mIter = type->getExpireMeta(ebGetListPtr(type, eb)); + while (1) { + if (ebGetMetaExpTime(mIter) >= now) + return numExpired; + + numExpired++; + + if (mIter->lastInSegment) + return numExpired; + + mIter = type->getExpireMeta(mIter->next); + } + } + + /* Handle rax active-expire */ + rax *rax = ebGetRaxPtr(eb); + raxIterator iter; + raxStart(&iter, rax); + uint64_t nowKey = EB_BUCKET_KEY(now); + raxSeek(&iter,"^",NULL,0); + assert(raxNext(&iter)); /* must be at least one bucket */ + FirstSegHdr *currBucket = iter.data; + + while (1) { + /* if 'currBucket' is last bucket, then break */ + if(!raxNext(&iter)) break; + FirstSegHdr *nextBucket = iter.data; + + /* if 'nextBucket' is not less than now then break */ + if (raxKey2BucketKey(iter.key) >= nowKey) break; + + /* nextBucket less than now. For sure all items in currBucket are expired */ + numExpired += currBucket->totalItems; + currBucket = nextBucket; + } + raxStop(&iter); + + /* If single segment bucket, iterate over items and count expired ones */ + if (currBucket->numSegs == 1) { + ExpireMeta *mIter = type->getExpireMeta(currBucket->head); + while (1) { + if (ebGetMetaExpTime(mIter) >= now) + return numExpired; + + numExpired++; + + if (mIter->lastInSegment) + return numExpired; + + mIter = type->getExpireMeta(mIter->next); + } + } + + /* Bucket key exactly reflect expiration time of all items (currBucket->numSegs > 1) */ + if (EB_BUCKET_KEY_PRECISION == 0) { + if (ebGetMetaExpTime(type->getExpireMeta(currBucket->head)) >= now) + return numExpired; + else + return numExpired + currBucket->totalItems; + } + + /* Iterate extended-segment and count expired ones */ + + /* Unreachable code, provided for completeness. 
Following operation is not + * bound in time and this is the main reason why we set above + * EB_BUCKET_KEY_PRECISION to 0 and have early return on previous condition */ + + ExpireMeta *mIter = type->getExpireMeta(currBucket->head); + while(1) { + if (ebGetMetaExpTime(mIter) < now) + numExpired++; + + if (mIter->lastItemBucket) + return numExpired; + + if (mIter->lastInSegment) + mIter = type->getExpireMeta(((NextSegHdr *) mIter->next)->head); + else + mIter = type->getExpireMeta(mIter->next); + } +} + +/** + * Retrieves the expiration time of the item with the nearest expiration + * + * @param eb - The ebucket to be checked. + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * + * @return The expiration time of the item with the nearest expiration time in + * the ebucket. If empty, return EB_EXPIRE_TIME_INVALID. If ebuckets is + * of type rax and minimal bucket is extended-segment, then it might not + * return accurate result up-to 1<getExpireMeta(ebGetListPtr(type, eb))); + + /* rax */ + uint64_t minExpire; + rax *rax = ebGetRaxPtr(eb); + raxIterator iter; + raxStart(&iter, rax); + raxSeek(&iter, "^", NULL, 0); + raxNext(&iter); /* seek to the last bucket */ + FirstSegHdr *firstSegHdr = iter.data; + if ((firstSegHdr->numSegs == 1) || (EB_BUCKET_KEY_PRECISION == 0)) { + /* Single segment, or extended-segments that all have same expiration time. + * return the first item with the nearest expiration time */ + minExpire = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head)); + } else { + + /* If reached here, then it is because it is extended segment and buckets + * are with lower precision than 1msec. In that case it is better not to + * iterate extended-segments, which might be unbounded, and just return + * worst possible expiration time in this bucket. 
+ * + * The reason we return blindly worst case expiration time value in this + * bucket is because the only usage of this function is to figure out + * when is the next time active expiration should be performed, and it + * is better to do it only after 1 or more items were expired and not the + * other way around. + */ + uint64_t expTime = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head)); + minExpire = expTime | ( (1<getExpireMeta(item); + while (em->lastInSegment == 0) + em = type->getExpireMeta(em->next); + return ebGetMetaExpTime(em); + } + + /* rax */ + uint64_t maxExpire; + rax *rax = ebGetRaxPtr(eb); + raxIterator iter; + raxStart(&iter, rax); + raxSeek(&iter, "$", NULL, 0); + raxNext(&iter); /* seek to the last bucket */ + FirstSegHdr *firstSegHdr = iter.data; + if (firstSegHdr->numSegs == 1) { + /* Single segment. return the last item with the highest expiration time */ + ExpireMeta *em = type->getExpireMeta(firstSegHdr->head); + while (em->lastInSegment == 0) + em = type->getExpireMeta(em->next); + maxExpire = ebGetMetaExpTime(em); + } else if (EB_BUCKET_KEY_PRECISION == 0) { + /* Extended-segments that all have same expiration time */ + maxExpire = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head)); + } else { + if (accurate == 0) { + /* return upper limit of the last bucket */ + int mask = (1<getExpireMeta(firstSegHdr->head)); + maxExpire = (expTime + (mask+1)) & (~mask); + } else { + maxExpire = 0; + ExpireMeta *mIter = type->getExpireMeta(firstSegHdr->head); + while(1) { + while(1) { + if (maxExpire < ebGetMetaExpTime(mIter)) + maxExpire = ebGetMetaExpTime(mIter); + if (mIter->lastInSegment == 1) break; + mIter = type->getExpireMeta(mIter->next); + } + + if (mIter->lastItemBucket) break; + mIter = type->getExpireMeta(((NextSegHdr *) mIter->next)->head); + } + } + } + raxStop(&iter); + return maxExpire; +} + +/** + * Retrieves the total number of items in the ebucket. 
+ */ +uint64_t ebGetTotalItems(ebuckets eb, EbucketsType *type) { + if (ebIsEmpty(eb)) + return 0; + + if (ebIsList(eb)) + return type->getExpireMeta(ebGetListPtr(type, eb))->numItems; + else + return *ebRaxNumItems(ebGetRaxPtr(eb)); +} + +/* print expiration-time of items, ebuckets layout and some statistics */ +void ebPrint(ebuckets eb, EbucketsType *type) { + _ebPrint(eb, type, -1, 1); +} + +/* Validate the general structure of ebuckets. Calls assert(0) on error. */ +void ebValidate(ebuckets eb, EbucketsType *type) { + if (ebIsEmpty(eb)) + return; + + if (ebIsList(eb)) + ebValidateList(ebGetListPtr(type, eb), type); + else + ebValidateRax(ebGetRaxPtr(eb), type); +} + +/* Retrieves the expiration time associated with the given item. If associated + * ExpireMeta is marked as trash, then return EB_EXPIRE_TIME_INVALID */ +uint64_t ebGetExpireTime(EbucketsType *type, eItem item) { + ExpireMeta *meta = type->getExpireMeta(item); + if (unlikely(meta->trash)) return EB_EXPIRE_TIME_INVALID; + return ebGetMetaExpTime(meta); +} + +/*** Unit tests ***/ + +#ifdef REDIS_TEST +#include +#include +#include +#include "testhelp.h" + +#define TEST(name) printf("[TEST] >>> %s\n", name); +#define TEST_COND(name, cond) printf("[%s] >>> %s\n", (cond) ? "TEST" : "BYPS", name); if (cond) + +typedef struct MyItem { + ExpireMeta mexpire; +} MyItem; + +typedef struct TimeRange { + uint64_t start; + uint64_t end; +} TimeRange; + +ExpireMeta *getMyItemExpireMeta(const eItem item) { + return &((MyItem *)item)->mexpire; +} + +ExpireAction toExpireItemCb(void *ctx, eItem item); +void toDeleteItemCb(eItem item, void *ctx); +EbucketsType myEbucketsType = { + .getExpireMeta = getMyItemExpireMeta, + .onDeleteItem = toDeleteItemCb, + .itemsAddrAreOdd = 0, +}; + +EbucketsType myEbucketsType2 = { + .getExpireMeta = getMyItemExpireMeta, + .onDeleteItem = NULL, + .itemsAddrAreOdd = 0, +}; + +/* XOR over all items time-expiration. 
Must be 0 after all addition/removal */
+uint64_t expItemsHashValue = 0;
+/* Expiration callback used by the tests. XORs the item's expiration time into
+ * `expItemsHashValue`, asserts that the time lies within the TimeRange given
+ * as 'ctx' (if not NULL), releases the item unless benchmarking, and asks for
+ * the item to be removed. */
+ExpireAction toExpireItemCb(eItem item, void *ctx) {
+    ExpireMeta *meta = myEbucketsType.getExpireMeta(item);
+    uint64_t expTime = ebGetMetaExpTime(meta);
+    expItemsHashValue = expItemsHashValue ^ expTime;
+
+    TimeRange *range = (TimeRange *) ctx;
+    /* Verify expiration time is within the range */
+    if (range != NULL) assert(expTime >= range->start && expTime <= range->end);
+
+/* When benchmarking, skip the heavyweight free operation. It is user side logic */
+#ifndef EB_TEST_BENCHMARK
+    zfree(item);
+#endif
+    return ACT_REMOVE_EXP_ITEM;
+}
+/* Deletion callback used by the tests: releases the item, 'ctx' is ignored */
+void toDeleteItemCb(eItem item, void *ctx) {
+    UNUSED(ctx); /* NOTE(review): UNUSED is #define'd further down in this file;
+                  * confirm that a definition is also visible at this point */
+    zfree(item);
+}
+/* Allocate 'numItems' items and add them to 'eb' (type myEbucketsType) with
+ * expiration times startExpire, startExpire+step, startExpire+2*step, ...
+ * Each expiration time is XORed into `expItemsHashValue`. If 'ar' is not NULL,
+ * the allocated items are also stored in it, in order of creation. */
+void addItems(ebuckets *eb, uint64_t startExpire, int step, uint64_t numItems, MyItem **ar) {
+    for (uint64_t i = 0 ; i < numItems ; i++) {
+        uint64_t expireTime = startExpire + (i * step);
+        expItemsHashValue = expItemsHashValue ^ expireTime;
+        MyItem *item = zmalloc(sizeof(MyItem));
+        if (ar) ar[i] = item;
+        ebAdd(eb, &myEbucketsType, item, expireTime);
+    }
+}
+
+/* expireRanges - is given as bucket-key to be agnostic to the different configuration
+ * of EB_BUCKET_KEY_PRECISION */
+void distributeTest(int lowestTime,
+                    uint64_t *expireRanges,
+                    const int *ItemsPerRange,
+                    int numRanges,
+                    int isExpire,
+                    int printStat) {
+    struct timeval timeBefore, timeAfter, timeDryRun, timeCreation, timeDestroy;
+    ebuckets eb = ebCreate();
+
+    /* create items with random expiry */
+    uint64_t startRange = lowestTime;
+
+    expItemsHashValue = 0;
+    void *listOfItems = NULL;
+    for (int i = 0; i < numRanges; i++) {
+        uint64_t endRange = EB_BUCKET_EXP_TIME(expireRanges[i]);
+        for (int j = 0; j < ItemsPerRange[i]; j++) {
+            uint64_t randomExpirey = (rand() % (endRange - startRange)) + startRange;
+            expItemsHashValue = expItemsHashValue ^ (uint32_t) randomExpirey;
+            MyItem *item = zmalloc(sizeof(MyItem));
+            getMyItemExpireMeta(item)->next = listOfItems;
+            listOfItems = item;
+
ebSetMetaExpTime(getMyItemExpireMeta(item), randomExpirey); + } + startRange = EB_BUCKET_EXP_TIME(expireRanges[i]); /* next start range */ + } + + /* Take to sample memory after all items allocated and before insertion to ebuckets */ + size_t usedMemBefore = zmalloc_used_memory(); + + gettimeofday(&timeBefore, NULL); + while (listOfItems) { + MyItem *item = (MyItem *)listOfItems; + listOfItems = getMyItemExpireMeta(item)->next; + uint64_t expireTime = ebGetMetaExpTime(&item->mexpire); + ebAdd(&eb, &myEbucketsType, item, expireTime); + } + gettimeofday(&timeAfter, NULL); + timersub(&timeAfter, &timeBefore, &timeCreation); + + gettimeofday(&timeBefore, NULL); + ebExpireDryRun(eb, &myEbucketsType, 0xFFFFFFFFFFFF); /* expire dry-run all */ + gettimeofday(&timeAfter, NULL); + timersub(&timeAfter, &timeBefore, &timeDryRun); + + if (printStat) { + _ebPrint(eb, &myEbucketsType, zmalloc_used_memory() - usedMemBefore, 0); + } + + gettimeofday(&timeBefore, NULL); + if (isExpire) { + startRange = lowestTime; + /* Active expire according to the ranges */ + for (int i = 0 ; i < numRanges ; i++) { + + /* When checking how many items are expired, we need to take into + * consideration EB_BUCKET_KEY_PRECISION. The value of "info->now" + * will be adjusted by ebActiveExpire() to lookup only for all buckets + * with assigned keys that are older than 1<now) and not "<=". + * But if there is a list behind ebuckets, then this limitation is not + * applied and the operator "<=" will be used instead. + * + * The '-1' in case of list brings makes both cases aligned to have + * same result */ + uint64_t now = EB_BUCKET_EXP_TIME(expireRanges[i]) + (ebIsList(eb) ? 
-1 : 0); + + TimeRange range = {EB_BUCKET_EXP_TIME(startRange), EB_BUCKET_EXP_TIME(expireRanges[i]) }; + ExpireInfo info = { + .maxToExpire = 0xFFFFFFFF, + .onExpireItem = toExpireItemCb, + .ctx = &range, + .now = now, + .itemsExpired = 0}; + + ebExpire(&eb, &myEbucketsType, &info); + + assert( (eb==NULL && (i + 1 == numRanges)) || (eb!=NULL && (i + 1 < numRanges)) ); + assert( info.itemsExpired == (uint64_t) ItemsPerRange[i]); + startRange = expireRanges[i]; + } + assert(eb == NULL); + assert( (expItemsHashValue & 0xFFFFFFFF) == 0); + } + ebDestroy(&eb, &myEbucketsType, NULL); + gettimeofday(&timeAfter, NULL); + timersub(&timeAfter, &timeBefore, &timeDestroy); + + if (printStat) { + printf("Time elapsed ebuckets creation : %ld.%06ld\n", (long int)timeCreation.tv_sec, (long int)timeCreation.tv_usec); + printf("Time elapsed active-expire dry-run : %ld.%06ld\n", (long int)timeDryRun.tv_sec, (long int)timeDryRun.tv_usec); + if (isExpire) + printf("Time elapsed active-expire : %ld.%06ld\n", (long int)timeDestroy.tv_sec, (long int)timeDestroy.tv_usec); + else + printf("Time elapsed destroy : %ld.%06ld\n", (long int)timeDestroy.tv_sec, (long int)timeDestroy.tv_usec); + } + +} + +#define UNUSED(x) (void)(x) +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +int ebucketsTest(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + srand(0); + + int verbose = (flags & REDIS_TEST_VERBOSE) ? 
2 : 1; + UNUSED(verbose); + +#ifdef EB_TEST_BENCHMARK + TEST("ebuckets - benchmark 10 million items: alloc + add + activeExpire") { + + struct TestParams { + uint64_t minExpire; + uint64_t maxExpire; + int items; + const char *description; + } testCases[] = { + { 1805092100000, 1805092100000 + (uint64_t) 1, 10000000, "1 msec distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000, 10000000, "1 sec distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60, 10000000, "1 min distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60, 10000000, "1 hour distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60*24, 10000000, "1 day distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60*24*7, 10000000, "1 week distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60*24*30, 10000000, "1 month distribution" } + }; + + /* selected test */ + uint32_t tid = 3; + + printf("\n------ TEST EBUCKETS: %s ------\n", testCases[tid].description); + uint64_t expireRanges[] = { testCases[tid].minExpire, testCases[tid].maxExpire }; + int itemsPerRange[] = { 0, testCases[tid].items }; + + /* expireRanges[] is provided to distributeTest() as bucket-key values */ + for (uint32_t j = 0; j < ARRAY_SIZE(expireRanges); ++j) { + expireRanges[j] = expireRanges[j] >> EB_BUCKET_KEY_PRECISION; + } + + distributeTest(0, expireRanges, itemsPerRange, ARRAY_SIZE(expireRanges), 1, 1); + return 0; + } +#endif + + TEST("list - Create a single item, get TTL, and remove") { + MyItem *singleItem = zmalloc(sizeof(MyItem)); + ebuckets eb = NULL; + ebAdd(&eb, &myEbucketsType, singleItem, 1000); + assert(ebGetExpireTime(&myEbucketsType, singleItem) == 1000 ); + + /* remove the item */ + assert(ebRemove(&eb, &myEbucketsType, singleItem)); + /* now the ebuckets is empty */ + assert(ebRemove(&eb, &myEbucketsType, singleItem) == 0); + + zfree(singleItem); + + ebDestroy(&eb, &myEbucketsType, NULL); + } + + TEST("list - Create 
few items on different times, get TTL, and then remove") { + MyItem *items[EB_LIST_MAX_ITEMS]; + ebuckets eb = NULL; + for (int i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + items[i] = zmalloc(sizeof(MyItem)); + ebAdd(&eb, &myEbucketsType, items[i], i); + } + + for (uint64_t i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + assert(ebGetExpireTime(&myEbucketsType, items[i]) == i ); + assert(ebRemove(&eb, &myEbucketsType, items[i])); + } + + for (int i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + zfree(items[i]); + } + + ebDestroy(&eb, &myEbucketsType, NULL); + } + + TEST("list - Create few items on different times, get TTL, and then delete") { + MyItem *items[EB_LIST_MAX_ITEMS]; + ebuckets eb = NULL; + for (int i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + items[i] = zmalloc(sizeof(MyItem)); + ebAdd(&eb, &myEbucketsType, items[i], i); + } + + for (uint64_t i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + assert(ebGetExpireTime(&myEbucketsType, items[i]) == i ); + } + + ebDestroy(&eb, &myEbucketsType, NULL); + } + + TEST_COND("ebuckets - Add items with increased/decreased expiration time and then expire", + EB_BUCKET_KEY_PRECISION > 0) + { + ebuckets eb = NULL; + + for (int isDecr = 0; isDecr < 2; ++isDecr) { + for (uint32_t numItems = 1; numItems < 64; ++numItems) { + uint64_t step = 1 << EB_BUCKET_KEY_PRECISION; + + if (isDecr == 0) + addItems(&eb, 0, step, numItems, NULL); + else + addItems(&eb, (numItems - 1) * step, -step, numItems, NULL); + + for (uint32_t i = 1; i <= numItems; i++) { + TimeRange range = {EB_BUCKET_EXP_TIME(i - 1), EB_BUCKET_EXP_TIME(i)}; + ExpireInfo info = { + .maxToExpire = 1, + .onExpireItem = toExpireItemCb, + .ctx = &range, + .now = EB_BUCKET_EXP_TIME(i), + .itemsExpired = 0}; + + ebExpire(&eb, &myEbucketsType, &info); + assert(info.itemsExpired == 1); + if (i == numItems) { /* if last item */ + assert(eb == NULL); + assert(info.nextExpireTime == 0); + } else { + assert(info.nextExpireTime == EB_BUCKET_EXP_TIME(i)); + } + } + } + } + } + + TEST_COND("ebuckets - Create 
items with same expiration time and then expire", + EB_BUCKET_KEY_PRECISION > 0) + { + ebuckets eb = NULL; + uint64_t expirePerIter = 2; + for (uint32_t numIterations = 1; numIterations < 100; ++numIterations) { + uint32_t numItems = numIterations * expirePerIter; + uint64_t expireTime = (1 << EB_BUCKET_KEY_PRECISION) + 1; + addItems(&eb, expireTime, 0, numItems, NULL); + + for (uint32_t i = 1; i <= numIterations; i++) { + ExpireInfo info = { + .maxToExpire = expirePerIter, + .onExpireItem = toExpireItemCb, + .ctx = NULL, + .now = (2 << EB_BUCKET_KEY_PRECISION), + .itemsExpired = 0}; + ebExpire(&eb, &myEbucketsType, &info); + assert(info.itemsExpired == expirePerIter); + if (i == numIterations) { /* if last item */ + assert(eb == NULL); + assert(info.nextExpireTime == 0); + } else { + assert(info.nextExpireTime == expireTime); + } + } + } + } + + TEST("list - Create few items on random times and then expire/delete ") { + for (int isExpire = 0 ; isExpire <= 1 ; ++isExpire ) { + uint64_t expireRanges[] = {1000}; /* bucket-keys */ + int itemsPerRange[] = {EB_LIST_MAX_ITEMS}; + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), isExpire, 0); + } + } + + TEST("list - Create few items (list) on same time and then active expire/delete ") { + for (int isExpire = 0 ; isExpire <= 1 ; ++isExpire ) { + uint64_t expireRanges[] = {1, 2}; /* bucket-keys */ + int itemsPerRange[] = {0, EB_LIST_MAX_ITEMS}; + + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), isExpire, 0); + } + } + + TEST("ebuckets - Create many items on same time and then active expire/delete ") { + for (int isExpire = 1 ; isExpire <= 1 ; ++isExpire ) { + uint64_t expireRanges[] = {1, 2}; /* bucket-keys */ + int itemsPerRange[] = {0, 20}; + + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), isExpire, 0); + } + } + + TEST("ebuckets - Create items on different times and then expire/delete ") { + for (int isExpire = 0 ; isExpire <= 0 ; 
++isExpire ) { + for (int numItems = 1 ; numItems < 100 ; ++numItems ) { + uint64_t expireRanges[] = {1000000}; /* bucket-keys */ + int itemsPerRange[] = {numItems}; + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), 1, 0); + } + } + } + + TEST("ebuckets - Create items on different times and then ebRemove() ") { + ebuckets eb = NULL; + + for (int step = -1 ; step <= 1 ; ++step) { + for (int numItems = 1; numItems <= EB_SEG_MAX_ITEMS*3; ++numItems) { + for (int offset = 0; offset < numItems; offset++) { + MyItem *items[numItems]; + uint64_t startValue = 1000 << EB_BUCKET_KEY_PRECISION; + int stepValue = step * (1 << EB_BUCKET_KEY_PRECISION); + addItems(&eb, startValue, stepValue, numItems, items); + for (int i = 0; i < numItems; i++) { + int at = (i + offset) % numItems; + assert(ebRemove(&eb, &myEbucketsType, items[at])); + zfree(items[at]); + } + assert(eb == NULL); + } + } + } + } + + TEST("ebuckets - test min/max expire time") { + ebuckets eb = NULL; + MyItem items[3*EB_SEG_MAX_ITEMS]; + for (int numItems = 1 ; numItems < (int)ARRAY_SIZE(items) ; numItems++) { + uint64_t minExpTime = RAND_MAX, maxExpTime = 0; + for (int i = 0; i < numItems; i++) { + /* generate random expiration time */ + uint64_t expireTime = rand(); + if (expireTime < minExpTime) minExpTime = expireTime; + if (expireTime > maxExpTime) maxExpTime = expireTime; + ebAdd(&eb, &myEbucketsType2, items + i, expireTime); + assert(ebGetNextTimeToExpire(eb, &myEbucketsType2) == minExpTime); + assert(ebGetMaxExpireTime(eb, &myEbucketsType2, 0) == maxExpTime); + } + ebDestroy(&eb, &myEbucketsType2, NULL); + } + } + + TEST_COND("ebuckets - test min/max expire time, with extended-segment", + (1< 2*EB_SEG_MAX_ITEMS) { + ebuckets eb = NULL; + MyItem items[(2*EB_SEG_MAX_ITEMS)-1]; + for (int numItems = EB_SEG_MAX_ITEMS+1 ; numItems < (int)ARRAY_SIZE(items) ; numItems++) { + /* First reach extended-segment (two chained segments in a bucket) */ + for (int i = 0; i <= EB_SEG_MAX_ITEMS; 
i++) { + uint64_t itemExpireTime = (1< size(Seg0) = 11 ==> [ 00-10 ] -> size(Seg0) = 11 + * [ 11-76 ] -> size(Seg1) = 16 [ 11-36 ] -> size(Seg1) = 9 + * [ 37-76 ] -> size(Seg2) = 7 + * + * EXTENDING BUCKET + * ---------------- + * In the example above, the reason it wasn't split evenly is that Seg1 must have + * been holding items with same TTL and they must reside together in the same + * bucket after the split. Which brings us to another important point. If there + * is a segment that reached its maximum capacity and all the items have same + * expiration-time key, then we cannot split the bucket but aggregate all the + * items, with same expiration time key, by allocating an extended-segment and + * chain it to the first segment in visited bucket. In that sense, extended + * segments will only hold items with same expiration-time key. + * + * BUCKETS BUCKETS + * [ 00-10 ] -> size(Seg0)=11 ==> [ 00-10 ] -> size(Seg0)=11 + * [ 11-12 ] -> size(Seg1)=16 [ 11-12 ] -> size(Seg1)=1 -> size(Seg2)=16 + * + * LIMITING RAX TREE DEPTH + * ----------------------- + * The rax tree is basically a B-tree and its depth is bounded by the sizeof of + * the key. Holding 6 bytes for expiration-time key is more than enough to represent + * unix-time in msec, and in turn the depth of the tree is limited to 6 levels. + * At a first glance it might look sufficient but we need take into consideration + * the heavyweight maintenance and traversal of each node in the B-tree. + * + * And so, we can further prune the tree such that holding keys with msec precision + * in the tree doesn't bring with it much value. 
The active-expiration operation can + * live with deletion of expired items, say, older than 1 sec, which means the size + * of time-expiration keys to the rax tree become no more than ~4.5 bytes and we + * also get rid of the "noisy" bits which most probably will cause to yet another + * branching and modification of the rax tree in case of items with time-expiration + * difference of less than 1 second. The lazy expiration will still be precise and + * without compromise on accuracy because the exact expiration-time is kept + * attached as well to each item, in `ExpireMeta`, and each traversal of item with + * expiration will behave as expected down to the msec. Take care to configure + * `EB_BUCKET_KEY_PRECISION` according to your needs. + * + * EBUCKET KEY + * ----------- + * Taking into account configured value of `EB_BUCKET_KEY_PRECISION`, two items + * with expiration-time t1 and t2 will be considered to have the same key in the + * rax-tree/buckets if and only if: + * + * EB_BUCKET_KEY(t1) == EB_BUCKET_KEY(t2) + * + * EBUCKETS CREATION + * ----------------- + * To avoid the cost of allocating rax data-structure for only few elements, + * ebuckets will start as a simple linked-list and only when it reaches some + * threshold, it will be converted to rax. + * + * TODO + * ---- + * - ebRemove() optimize to merge small segments into one segment. + * - ebAdd() Fix pathological case of cascade addition of items into rax such + * that their values are smaller/bigger than visited extended-segment which ends + * up with multiple segments with a single item in each segment. + */ + +#ifndef __EBUCKETS_H +#define __EBUCKETS_H + +#include +#include +#include +#include +#include "rax.h" + +/* + * EB_BUCKET_KEY_PRECISION - Defines the number of bits to ignore from the + * expiration-time when mapping to buckets. The higher the value, the more items + * with similar expiration-time will be aggregated into the same bucket. 
The lower + * the value, the more "accurate" the active expiration of buckets will be. + * + * Note that the accurate time expiration of each item is preserved anyway and + * enforced by lazy expiration. It only impacts the active expiration that will + * be able to work on buckets older than (1<> EB_BUCKET_KEY_PRECISION) + + +#define EB_EXPIRE_TIME_MAX ((uint64_t)0x0000FFFFFFFFFFFF) /* Maximum expire-time. */ +#define EB_EXPIRE_TIME_INVALID (EB_EXPIRE_TIME_MAX+1) /* assumed bigger than max */ + +/* Handler to ebuckets DS. Pointer to a list, rax or NULL (empty DS). See also ebIsList(). */ +typedef void *ebuckets; + +/* Users of ebuckets will store `eItem` which is just a void pointer to their + * element. In addition, eItem should embed the ExpireMeta struct and supply + * getter function (see EbucketsType.getExpireMeta). + */ +typedef void *eItem; + +/* This struct Should be embedded inside `eItem` and must be aligned in memory. */ +typedef struct ExpireMeta { + /* 48bits of unix-time in msec. This value is sufficient to represent, in + * unix-time, until the date of 02 August, 10889 + */ + uint32_t expireTimeLo; /* Low bits of expireTime. */ + uint16_t expireTimeHi; /* High bits of expireTime. */ + + unsigned int lastInSegment : 1; /* Last item in segment. If set, then 'next' will + point to the NextSegHdr, unless lastItemBucket=1 + then it will point to segment header of the + current segment. */ + unsigned int firstItemBucket : 1; /* First item in bucket. This flag assist + to manipulate segments directly without + the need to traverse from start the + rax tree */ + unsigned int lastItemBucket : 1; /* Last item in bucket. This flag assist + to manipulate segments directly without + the need to traverse from start the + rax tree */ + unsigned int numItems : 5; /* Only first item in segment will maintain + this value. */ + + unsigned int trash : 1; /* This flag indicates whether the ExpireMeta + associated with the item is leftover. 
+ There is always a potential to reuse the + item after removal/deletion. Note that, + the user can still safely O(1) TTL lookup + a given item and verify whether attached + TTL is valid or leftover. See function + ebGetExpireTime(). */ + + unsigned int userData : 3; /* ebuckets can be used to store in same + instance few different types of items, + such as, listpack and hash. This field + is reserved to store such identification + associated with the item and can help + to distinct on delete or expire callback. + It is not used by ebuckets internally and + should be maintained by the user */ + + unsigned int reserved : 4; + + void *next; /* - If not last item in segment then next + points to next eItem (lastInSegment=0). + - If last in segment but not last in + bucket (lastItemBucket=0) then it + points to next segment header. + - If last in bucket then it points to + current segment header (Can be either + of type FirstSegHdr or NextSegHdr). */ +} ExpireMeta; + +/* Each instance of ebuckets need to have corresponding EbucketsType that holds + * the necessary callbacks and configuration to operate correctly on the type + * of items that are stored in it. Conceptually it should have hold reference + * from ebuckets instance to this type, but to save memory we will pass it as + * an argument to each API call. */ +typedef struct EbucketsType { + /* getter to extract the ExpireMeta from the item */ + ExpireMeta* (*getExpireMeta)(const eItem item); + + /* Called during ebDestroy(). Set to NULL if not needed. */ + void (*onDeleteItem)(eItem item, void *ctx); + + /* Is addresses of items are odd in memory. It is taken into consideration + * and used by ebuckets to know how to distinct between ebuckets pointer to + * rax versus a pointer to item which is head of list. */ + unsigned int itemsAddrAreOdd; +} EbucketsType; + +/* Returned value by `onExpireItem` callback to indicate the action to be taken by + * ebExpire(). 
*/
+typedef enum ExpireAction {
+    ACT_REMOVE_EXP_ITEM=0,     /* Remove the item from ebuckets. */
+    ACT_UPDATE_EXP_ITEM,       /* Re-insert the item with an updated expiration-time.
+                                  Before returning this value, the callback needs to
+                                  update the expiration time of the item with the
+                                  helper function ebSetMetaExpTime(). The item will be
+                                  kept aside and will be added again to ebuckets
+                                  at the end of ebExpire() */
+    ACT_STOP_ACTIVE_EXP        /* Stop active-expiration. It will assume that the
+                                  provided 'item' wasn't deleted by the callback. */
+} ExpireAction;
+
+/* ExpireInfo is used to pass input and output parameters to ebExpire(). */
+typedef struct ExpireInfo {
+    /* onExpireItem - Called during active-expiration by ebExpire() */
+    ExpireAction (*onExpireItem)(eItem item, void *ctx);
+
+    uint64_t maxToExpire;      /* [INPUT ] Limit on the number of expired items to scan */
+    void *ctx;                 /* [INPUT ] Context to pass to onExpireItem */
+    uint64_t now;              /* [INPUT ] Current time in msec. */
+    uint64_t nextExpireTime;   /* [OUTPUT] Next expiration time. Returns 0 if none left. */
+    uint64_t itemsExpired;     /* [OUTPUT] Returns the number of expired items. */
+} ExpireInfo;
+
+/* ebuckets API */
+
+/* An empty ebuckets instance is represented by a NULL handle. */
+static inline ebuckets ebCreate(void) { return NULL; } /* Empty ebuckets */
+
+void ebDestroy(ebuckets *eb, EbucketsType *type, void *deletedItemsCbCtx);
+
+void ebExpire(ebuckets *eb, EbucketsType *type, ExpireInfo *info);
+
+uint64_t ebExpireDryRun(ebuckets eb, EbucketsType *type, uint64_t now);
+
+/* Returns non-zero when the handle is NULL, i.e. the value ebCreate() returns. */
+static inline int ebIsEmpty(ebuckets eb) { return eb == NULL; }
+
+uint64_t ebGetNextTimeToExpire(ebuckets eb, EbucketsType *type);
+
+uint64_t ebGetMaxExpireTime(ebuckets eb, EbucketsType *type, int accurate);
+
+uint64_t ebGetTotalItems(ebuckets eb, EbucketsType *type);
+
+/* Item related API */
+
+int ebRemove(ebuckets *eb, EbucketsType *type, eItem item);
+
+int ebAdd(ebuckets *eb, EbucketsType *type, eItem item, uint64_t expireTime);
+
+uint64_t ebGetExpireTime(EbucketsType *type, eItem item);
+
+/* Reassemble the expiration time stored in 'expMeta': expireTimeHi supplies
+ * the bits from bit 32 upward and expireTimeLo supplies the low 32 bits. */
+static inline uint64_t ebGetMetaExpTime(ExpireMeta *expMeta) {
+    return (((uint64_t)(expMeta)->expireTimeHi << 32) | (expMeta)->expireTimeLo);
+}
+
+/* Store expiration time 't' into 'expMeta', split into its low 32 bits
+ * (expireTimeLo) and the following 16 bits (expireTimeHi). Because of the
+ * uint16_t cast, any bits of 't' above bit 47 are silently dropped. */
+static inline void ebSetMetaExpTime(ExpireMeta *expMeta, uint64_t t) {
+    expMeta->expireTimeLo = (uint32_t)(t&0xFFFFFFFF);
+    expMeta->expireTimeHi = (uint16_t)((t) >> 32);
+}
+
+/* Debug API */
+
+void ebValidate(ebuckets eb, EbucketsType *type);
+
+void ebPrint(ebuckets eb, EbucketsType *type);
+
+#ifdef REDIS_TEST
+int ebucketsTest(int argc, char *argv[], int flags);
+#endif
+
+#endif /* __EBUCKETS_H */
diff --git a/src/expire.c b/src/expire.c
index b73b5245f..646f752a9 100644
--- a/src/expire.c
+++ b/src/expire.c
@@ -94,6 +94,7 @@ int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) {
 #define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* Max % of CPU to use. */
 #define ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE 10 /* % of stale keys after which
                                                    we do extra efforts. */
+#define HFE_ACTIVE_EXPIRE_CYCLE_FIELDS 1000
 
 /* Data used by the expire dict scan callback.
*/ typedef struct { @@ -134,6 +135,53 @@ static inline int isExpiryDictValidForSamplingCb(dict *d) { return C_OK; } +/* Active expiration Cycle for hash-fields. + * + * Note that releasing fields is expected to be more predictable and rewarding + * than releasing keys because it is stored in `ebuckets` DS which optimized for + * active expiration and in addition the deletion of fields is simple to handle. */ +static inline void activeExpireHashFieldCycle(int type) { + /* Remember current db across calls */ + static unsigned int currentDb = 0; + + /* Tracks the count of fields actively expired for the current database. + * This count continues as long as it fails to actively expire all expired + * fields of currentDb, indicating a possible need to adjust the value of + * maxToExpire. */ + static uint64_t activeExpirySequence = 0; + /* Threshold for adjusting maxToExpire */ + const uint32_t EXPIRED_FIELDS_TH = 1000000; + /* Maximum number of fields to actively expire in a single call */ + uint32_t maxToExpire = HFE_ACTIVE_EXPIRE_CYCLE_FIELDS; + + redisDb *db = server.db + currentDb; + + /* If db is empty, move to next db and return */ + if (ebIsEmpty(db->hexpires)) { + activeExpirySequence = 0; + currentDb = (currentDb + 1) % server.dbnum; + return; + } + + /* If running for a while and didn't manage to active-expire all expired fields of + * currentDb (i.e. activeExpirySequence becomes significant) then adjust maxToExpire */ + if ((activeExpirySequence > EXPIRED_FIELDS_TH) && (type == ACTIVE_EXPIRE_CYCLE_SLOW)) { + /* maxToExpire is multiplied by a factor between 1 and 32, proportional to + * the number of times activeExpirySequence exceeded EXPIRED_FIELDS_TH */ + uint64_t factor = activeExpirySequence / EXPIRED_FIELDS_TH; + maxToExpire *= (factor<32) ? 
factor : 32; + } + + if (hashTypeDbActiveExpire(db, maxToExpire) == maxToExpire) { + /* active-expire reached maxToExpire limit */ + activeExpirySequence += maxToExpire; + } else { + /* Managed to active-expire all expired fields of currentDb */ + activeExpirySequence = 0; + currentDb = (currentDb + 1) % server.dbnum; + } +} + void activeExpireCycle(int type) { /* Adjust the running parameters according to the configured expire * effort. The default effort is 1, and the maximum configurable effort @@ -232,6 +280,11 @@ void activeExpireCycle(int type) { * distribute the time evenly across DBs. */ current_db++; + /* Interleaving hash-field expiration with key expiration. Better + * call it before handling expired keys because HFE DS is optimized for + * active expiration */ + activeExpireHashFieldCycle(type); + if (kvstoreSize(db->expires)) dbs_performed++; diff --git a/src/lazyfree.c b/src/lazyfree.c index e743cb204..2b98f9a06 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -3,6 +3,7 @@ #include "atomicvar.h" #include "functions.h" #include "cluster.h" +#include "ebuckets.h" static redisAtomic size_t lazyfree_objects = 0; static redisAtomic size_t lazyfreed_objects = 0; @@ -22,7 +23,8 @@ void lazyfreeFreeObject(void *args[]) { void lazyfreeFreeDatabase(void *args[]) { kvstore *da1 = args[0]; kvstore *da2 = args[1]; - + ebuckets oldHfe = args[2]; + ebDestroy(&oldHfe, &hashExpireBucketsType, NULL); size_t numkeys = kvstoreSize(da1); kvstoreRelease(da1); kvstoreRelease(da2); @@ -201,10 +203,12 @@ void emptyDbAsync(redisDb *db) { flags |= KVSTORE_FREE_EMPTY_DICTS; } kvstore *oldkeys = db->keys, *oldexpires = db->expires; + ebuckets oldHfe = db->hexpires; db->keys = kvstoreCreate(&dbDictType, slot_count_bits, flags); db->expires = kvstoreCreate(&dbExpiresDictType, slot_count_bits, flags); + db->hexpires = ebCreate(); atomicIncr(lazyfree_objects, kvstoreSize(oldkeys)); - bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires); + 
bioCreateLazyFreeJob(lazyfreeFreeDatabase, 3, oldkeys, oldexpires, oldHfe); } /* Free the key tracking table. diff --git a/src/module.c b/src/module.c index b3cfa38c4..11078020f 100644 --- a/src/module.c +++ b/src/module.c @@ -745,7 +745,7 @@ int moduleDelKeyIfEmpty(RedisModuleKey *key) { case OBJ_LIST: isempty = listTypeLength(o) == 0; break; case OBJ_SET: isempty = setTypeSize(o) == 0; break; case OBJ_ZSET: isempty = zsetLength(o) == 0; break; - case OBJ_HASH: isempty = hashTypeLength(o) == 0; break; + case OBJ_HASH: isempty = hashTypeLength(o, 0) == 0; break; case OBJ_STREAM: isempty = streamLength(o) == 0; break; default: isempty = 0; } @@ -4168,7 +4168,7 @@ size_t RM_ValueLength(RedisModuleKey *key) { case OBJ_LIST: return listTypeLength(key->value); case OBJ_SET: return setTypeSize(key->value); case OBJ_ZSET: return zsetLength(key->value); - case OBJ_HASH: return hashTypeLength(key->value); + case OBJ_HASH: return hashTypeLength(key->value, 0); /* OPEN: To subtract expired fields? */ case OBJ_STREAM: return streamLength(key->value); default: return 0; } @@ -5296,7 +5296,7 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) { robj *argv[2] = {field,value}; hashTypeTryConversion(key->value,argv,0,1); - int updated = hashTypeSet(key->value, field->ptr, value->ptr, low_flags); + int updated = hashTypeSet(key->db, key->value, field->ptr, value->ptr, low_flags); count += (flags & REDISMODULE_HASH_COUNT_ALL) ? 
1 : updated; /* If CFIELDS is active, SDS string ownership is now of hashTypeSet(), @@ -11071,18 +11071,22 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { ScanKeyCBData *data = privdata; sds key = dictGetKey(de); robj *o = data->key->value; - robj *field = createStringObject(key, sdslen(key)); + robj *field = NULL; robj *value = NULL; if (o->type == OBJ_SET) { value = NULL; } else if (o->type == OBJ_HASH) { sds val = dictGetVal(de); + field = createStringObject(key, hfieldlen(key)); value = createStringObject(val, sdslen(val)); } else if (o->type == OBJ_ZSET) { double *val = (double*)dictGetVal(de); value = createStringObjectFromLongDouble(*val, 0); } + /* if type is OBJ_HASH then key is of type hfield. Otherwise sds. */ + if (!field) field = createStringObject(key, sdslen(key)); + data->fn(data->key, field, value, data->user_data); decrRefCount(field); if (value) decrRefCount(value); diff --git a/src/mstr.c b/src/mstr.c new file mode 100644 index 000000000..39200d731 --- /dev/null +++ b/src/mstr.c @@ -0,0 +1,524 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). + */ + +#include +#include +#include "sdsalloc.h" +#include "mstr.h" +#include "stdio.h" + +#define NULL_SIZE 1 + +static inline char mstrReqType(size_t string_size); +static inline int mstrHdrSize(char type); +static inline int mstrSumMetaLen(mstrKind *k, mstrFlags flags); +static inline size_t mstrAllocLen(const mstr s, struct mstrKind *kind); + +/*** mstr API ***/ + +/* Create mstr without any metadata attached, based on string 'initStr'. + * - If initStr equals NULL, then only allocation will be made. + * - string of mstr is always null-terminated. 
+ */
+mstr mstrNew(const char *initStr, size_t lenStr, int trymalloc) {
+    unsigned char *pInfo; /* pointer to mstr info field */
+    void *alloc;          /* start of the raw allocation. Not named 'sh', since
+                           * MSTR_HDR_VAR() declares its own 'sh' below. */
+    mstr s;
+    char type = mstrReqType(lenStr);
+    int mstrHdr = mstrHdrSize(type);
+
+    assert(lenStr + mstrHdr + NULL_SIZE > lenStr); /* Catch size_t overflow */
+
+    size_t len = mstrHdr + lenStr + NULL_SIZE;
+    alloc = trymalloc ? s_trymalloc(len) : s_malloc(len);
+
+    if (alloc == NULL) return NULL;
+
+    /* The returned pointer addresses the string itself. The header occupies
+     * the bytes right before it, and the byte at s[-1] is the info field. */
+    s = (char*)alloc + mstrHdr;
+    pInfo = ((unsigned char*)s) - 1;
+
+    switch(type) {
+        case MSTR_TYPE_5: {
+            /* Type-5 keeps the length inside the info byte itself */
+            *pInfo = CREATE_MSTR_INFO(lenStr, 0 /*ismeta*/, type);
+            break;
+        }
+        case MSTR_TYPE_8: {
+            MSTR_HDR_VAR(8,s);
+            *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 0 /*ismeta*/, type);
+            sh->len = lenStr;
+            break;
+        }
+        case MSTR_TYPE_16: {
+            MSTR_HDR_VAR(16,s);
+            *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 0 /*ismeta*/, type);
+            sh->len = lenStr;
+            break;
+        }
+        case MSTR_TYPE_64: {
+            MSTR_HDR_VAR(64,s);
+            *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 0 /*ismeta*/, type);
+            sh->len = lenStr;
+            break;
+        }
+    }
+
+    if (initStr && lenStr)
+        memcpy(s, initStr, lenStr);
+
+    s[lenStr] = '\0';
+    return s;
+}
+
+/* Creates mstr with given string. Reserve space for metadata.
+ *
+ * - kind       Describes the size of each metadata slot (kind->metaSize[]).
+ * - initStr    Initial content. If NULL, the string bytes are left uninitialized
+ *              (only the null terminator is written).
+ * - lenStr     Length of the string, not counting the null terminator.
+ * - metaFlags  Bit i set means that space for metadata #i is reserved. The
+ *              metadata payload itself is NOT initialized by this function.
+ * - trymalloc  If non-zero, allocate with s_trymalloc() and return NULL on
+ *              allocation failure. Otherwise allocate with s_malloc().
+ *
+ * Note: mstrNew(s,l) and mstrNewWithMeta(s,l,0) are not the same. The first allocates
+ * just string. The second allocates a string with flags (yet without any metadata
+ * structures allocated).
+ */
+mstr mstrNewWithMeta(struct mstrKind *kind, const char *initStr, size_t lenStr, mstrFlags metaFlags, int trymalloc) {
+    unsigned char *pInfo; /* pointer to mstr info field */
+    char *allocMstr;
+    mstr mstrPtr;
+    char type = mstrReqType(lenStr);
+    int mstrHdr = mstrHdrSize(type);
+    int sumMetaLen = mstrSumMetaLen(kind, metaFlags);
+
+    /* mstrSumMetaLen() + sizeof(mstrFlags) + sizeof(mstrhdrX) + lenStr + '\0' */
+    size_t allocLen = sumMetaLen + sizeof(mstrFlags) + mstrHdr + lenStr + NULL_SIZE;
+
+    /* Catch size_t overflow, same guard as in mstrNew() */
+    assert(allocLen > lenStr);
+
+    allocMstr = trymalloc ? s_trymalloc(allocLen) : s_malloc(allocLen);
+
+    if (allocMstr == NULL) return NULL;
+
+    /* metadata is located at the beginning of the allocation, then meta-flags and lastly the string */
+    mstrFlags *pMetaFlags = (mstrFlags *) (allocMstr + sumMetaLen) ;
+    mstrPtr = ((char*) pMetaFlags) + sizeof(mstrFlags) + mstrHdr;
+    pInfo = ((unsigned char*)mstrPtr) - 1;
+
+    switch(type) {
+        case MSTR_TYPE_5: {
+            /* Type-5 keeps the length inside the info byte itself */
+            *pInfo = CREATE_MSTR_INFO(lenStr, 1 /*ismeta*/, type);
+            break;
+        }
+        case MSTR_TYPE_8: {
+            MSTR_HDR_VAR(8, mstrPtr);
+            sh->len = lenStr;
+            *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 1 /*ismeta*/, type);
+            break;
+        }
+        case MSTR_TYPE_16: {
+            MSTR_HDR_VAR(16, mstrPtr);
+            sh->len = lenStr;
+            *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 1 /*ismeta*/, type);
+            break;
+        }
+        case MSTR_TYPE_64: {
+            MSTR_HDR_VAR(64, mstrPtr);
+            sh->len = lenStr;
+            *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 1 /*ismeta*/, type);
+            break;
+        }
+    }
+    *pMetaFlags = metaFlags;
+    if (initStr != NULL) memcpy(mstrPtr, initStr, lenStr);
+    mstrPtr[lenStr] = '\0';
+
+    return mstrPtr;
+}
+
+/* Create copy of mstr. Flags can be modified. For each metadata flag, if
+ * same flag is set on both, then copy its metadata.
+ */
+mstr mstrNewCopy(struct mstrKind *kind, mstr src, mstrFlags newFlags) {
+    size_t srcLen = mstrlen(src);
+
+    /* if no flags are set, then just copy the string */
+    if (newFlags == 0) return mstrNew(src, srcLen, 0);
+
+    /* mstrNewWithMeta() already copies the string bytes and writes the null
+     * terminator, so the payload must not be copied a second time here. */
+    mstr dst = mstrNewWithMeta(kind, src, srcLen, newFlags, 0);
+
+    /* src carries no metadata, so there is nothing to inherit */
+    if (!mstrIsMetaAttached(src)) return dst;
+
+    /* Selectively copy metadata. Metadata blocks are laid out backward from
+     * the flags word, so walk both strings from their flags toward the start
+     * of the allocation, one flag bit at a time. */
+    mstrFlags *pFlags1 = mstrFlagsRef(src),
+              *pFlags2 = mstrFlagsRef(dst);
+
+    mstrFlags flags1Shift = *pFlags1,
+              flags2Shift = *pFlags2;
+
+    unsigned char *at1 = ((unsigned char *) pFlags1),
+                  *at2 = ((unsigned char *) pFlags2);
+
+    /* The loop ends once src has no more flags set: metadata that exists only
+     * in dst has no source to copy from and is left uninitialized. */
+    for (int i = 0; flags1Shift != 0; ++i) {
+        int isFlag1Set = flags1Shift & 0x1;
+        int isFlag2Set = flags2Shift & 0x1;
+
+        /* Each cursor steps back only over metadata its own string holds */
+        if (isFlag1Set) at1 -= kind->metaSize[i];
+        if (isFlag2Set) at2 -= kind->metaSize[i];
+
+        /* if the flag is set on both, then copy the metadata */
+        if (isFlag1Set && isFlag2Set)
+            memcpy(at2, at1, kind->metaSize[i]);
+        flags1Shift >>= 1;
+        flags2Shift >>= 1;
+    }
+    return dst;
+}
+
+/* Free mstring. Note, mstrKind is required to eval sizeof metadata and find start
+ * of allocation but if mstrIsMetaAttached(s) is false, you can pass NULL as well.
+ * Passing s == NULL is a no-op.
+ */
+void mstrFree(struct mstrKind *kind, mstr s) {
+    if (s != NULL)
+        s_free(mstrGetAllocPtr(kind, s));
+}
+
+/* return ref to metadata flags. Useful to modify directly flags which doesn't
+ * include metadata payload.
+ *
+ * The flags word is located immediately before the mstr header, whose size
+ * depends on the type encoded in the info byte at s[-1]. This function does not
+ * check whether metadata is attached; callers are expected to verify that
+ * with mstrIsMetaAttached() first. */
+mstrFlags *mstrFlagsRef(mstr s) {
+    switch(s[-1]&MSTR_TYPE_MASK) {
+        case MSTR_TYPE_5:
+            return ((mstrFlags *) (s - sizeof(struct mstrhdr5))) - 1;
+        case MSTR_TYPE_8:
+            return ((mstrFlags *) (s - sizeof(struct mstrhdr8))) - 1;
+        case MSTR_TYPE_16:
+            return ((mstrFlags *) (s - sizeof(struct mstrhdr16))) - 1;
+        default: /* MSTR_TYPE_64: */
+            return ((mstrFlags *) (s - sizeof(struct mstrhdr64))) - 1;
+    }
+}
+
+/* Return a reference to corresponding metadata of the specified metadata flag
+ * index (flagIdx).
If the metadata doesn't exist, it still returns a reference
+ * to the starting location where it would have been written among other metadatas.
+ * To verify if `flagIdx` of some metadata is attached, use `mstrGetFlag(s, flagIdx)`.
+ *
+ * The reference is found by summing the sizes of all attached metadata with
+ * index <= flagIdx and stepping back that many bytes from the flags word.
+ */
+void *mstrMetaRef(mstr s, struct mstrKind *kind, int flagIdx) {
+    int metaOffset = 0;
+    /* start iterating from flags backward */
+    mstrFlags *pFlags = mstrFlagsRef(s);
+    mstrFlags tmp = *pFlags;
+
+    for (int i = 0 ; i <= flagIdx ; ++i) {
+        if (tmp & 0x1) metaOffset += kind->metaSize[i];
+        tmp >>= 1;
+    }
+    return ((char *)pFlags) - metaOffset;
+}
+
+/* Return pointer to the start of the allocation that holds 'str'.
+ * 'kind' is dereferenced only when metadata is attached.
+ *
+ * mstr layout: [meta-data#N]...[meta-data#0][mstrFlags][mstrhdr][string][null] */
+void *mstrGetAllocPtr(struct mstrKind *kind, mstr str) {
+    if (!mstrIsMetaAttached(str))
+        return (char*)str - mstrHdrSize(str[-1]);
+
+    int totalMetaLen = mstrSumMetaLen(kind, *mstrFlagsRef(str));
+    return (char*)str - mstrHdrSize(str[-1]) - sizeof(mstrFlags) - totalMetaLen;
+}
+
+/* Prints in the following fashion:
+ *   [0x7f8bd8816017] my_mstr: foo (strLen=3, mstrLen=11, isMeta=1, metaFlags=0x1)
+ *   [0x7f8bd8816010] >> meta[0]: 0x78 0x56 0x34 0x12 (metaLen=4)
+ *
+ * The per-metadata lines are printed only if 'verbose' is non-zero. 'kind' must
+ * not be NULL because kind->name is always printed.
+ */
+void mstrPrint(mstr s, struct mstrKind *kind, int verbose) {
+    mstrFlags mflags, tmp;
+    int isMeta = mstrIsMetaAttached(s);
+
+    tmp = mflags = (isMeta) ? *mstrFlagsRef(s) : 0;
+
+    if (!isMeta) {
+        printf("[%p] %s: %s (strLen=%zu, mstrLen=%zu, isMeta=0)\n",
+               (void *)s, kind->name, s, mstrlen(s), mstrAllocLen(s, kind));
+        return;
+    }
+
+    /* %x expects unsigned int; mstrFlags would otherwise be promoted to int */
+    printf("[%p] %s: %s (strLen=%zu, mstrLen=%zu, isMeta=1, metaFlags=0x%x)\n",
+           (void *)s, kind->name, s, mstrlen(s), mstrAllocLen(s, kind),
+           (unsigned int) mflags);
+
+    if (verbose) {
+        for (unsigned int i = 0 ; i < NUM_MSTR_FLAGS ; ++i) {
+            if (tmp & 0x1) {
+                int mSize = kind->metaSize[i];
+                void *mRef = mstrMetaRef(s, kind, i);
+                /* 'i' is unsigned, so it is printed with %u */
+                printf("[%p] >> meta[%u]:", mRef, i);
+                for (int j = 0 ; j < mSize ; ++j) {
+                    printf(" 0x%02x", ((unsigned char *) mRef)[j]);
+                }
+                printf(" (metaLen=%d)\n", mSize);
+            }
+            tmp >>= 1;
+        }
+    }
+}
+
+/* return length of the string (ignoring metadata attached) */
+size_t mstrlen(const mstr s) {
+    unsigned char info = s[-1];
+    switch(info & MSTR_TYPE_MASK) {
+        case MSTR_TYPE_5:
+            /* Type-5 keeps the length inside the info byte itself */
+            return MSTR_TYPE_5_LEN(info);
+        case MSTR_TYPE_8:
+            return MSTR_HDR(8,s)->len;
+        case MSTR_TYPE_16:
+            return MSTR_HDR(16,s)->len;
+        default: /* MSTR_TYPE_64: */
+            return MSTR_HDR(64,s)->len;
+    }
+}
+
+/*** mstr internals ***/
+
+/* Sum the sizes (taken from k->metaSize[]) of all metadata whose bit is set in
+ * 'flags'. 'k' is not dereferenced when flags == 0. */
+static inline int mstrSumMetaLen(mstrKind *k, mstrFlags flags) {
+    int total = 0;
+    int i = 0 ;
+    while (flags) {
+        total += (flags & 0x1) ? k->metaSize[i] : 0;
+        flags >>= 1;
+        ++i;
+    }
+    return total;
+}
+
+/* Return the total number of bytes of the mstr:
+ * mstrSumMetaLen() + sizeof(mstrFlags) + sizeof(mstrhdrX) + strlen + '\0'
+ * The first two terms are counted only if metadata is attached. */
+static inline size_t mstrAllocLen(const mstr s, struct mstrKind *kind) {
+    int hdrlen;
+    mstrFlags *pMetaFlags;
+    size_t strLen = 0; /* not named 'strlen', to avoid shadowing the libc function */
+
+    int isMeta = mstrIsMetaAttached(s);
+    unsigned char info = s[-1];
+
+    switch(info & MSTR_TYPE_MASK) {
+        case MSTR_TYPE_5:
+            strLen = MSTR_TYPE_5_LEN(info);
+            hdrlen = sizeof(struct mstrhdr5);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(5, s)) - 1;
+            break;
+        case MSTR_TYPE_8:
+            strLen = MSTR_HDR(8,s)->len;
+            hdrlen = sizeof(struct mstrhdr8);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(8, s)) - 1;
+            break;
+        case MSTR_TYPE_16:
+            strLen = MSTR_HDR(16,s)->len;
+            hdrlen = sizeof(struct mstrhdr16);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(16, s)) - 1;
+            break;
+        default: /* MSTR_TYPE_64: */
+            strLen = MSTR_HDR(64,s)->len;
+            hdrlen = sizeof(struct mstrhdr64);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(64, s)) - 1;
+            break;
+    }
+    /* pMetaFlags is dereferenced only when metadata is attached */
+    return hdrlen + strLen + NULL_SIZE + ((isMeta) ? (mstrSumMetaLen(kind, *pMetaFlags) + sizeof(mstrFlags)) : 0);
+}
+
+/* returns pointer to the beginning of malloc() of mstr */
+void *mstrGetStartAlloc(mstr s, struct mstrKind *kind) {
+    int hdrlen;
+    mstrFlags *pMetaFlags;
+
+    int isMeta = mstrIsMetaAttached(s);
+
+    switch(s[-1]&MSTR_TYPE_MASK) {
+        case MSTR_TYPE_5:
+            hdrlen = sizeof(struct mstrhdr5);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(5, s)) - 1;
+            break;
+        case MSTR_TYPE_8:
+            hdrlen = sizeof(struct mstrhdr8);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(8, s)) - 1;
+            break;
+        case MSTR_TYPE_16:
+            hdrlen = sizeof(struct mstrhdr16);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(16, s)) - 1;
+            break;
+        default: /* MSTR_TYPE_64: */
+            hdrlen = sizeof(struct mstrhdr64);
+            pMetaFlags = ((mstrFlags *) MSTR_HDR(64, s)) - 1;
+            break;
+    }
+    return (char *) s - hdrlen - ((isMeta) ?
(mstrSumMetaLen(kind, *pMetaFlags) + sizeof(mstrFlags)) : 0); +} + +static inline int mstrHdrSize(char type) { + switch(type&MSTR_TYPE_MASK) { + case MSTR_TYPE_5: + return sizeof(struct mstrhdr5); + case MSTR_TYPE_8: + return sizeof(struct mstrhdr8); + case MSTR_TYPE_16: + return sizeof(struct mstrhdr16); + case MSTR_TYPE_64: + return sizeof(struct mstrhdr64); + } + return 0; +} + +static inline char mstrReqType(size_t string_size) { + if (string_size < 1<<5) + return MSTR_TYPE_5; + if (string_size < 1<<8) + return MSTR_TYPE_8; + if (string_size < 1<<16) + return MSTR_TYPE_16; + return MSTR_TYPE_64; +} + +#ifdef REDIS_TEST +#include +#include +#include "testhelp.h" +#include "limits.h" + +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + +/* Challenge mstr with metadata interesting enough that can include the case of hfield and hkey and more */ +#define B(idx) (1<<(idx)) + +#define META_IDX_MYMSTR_TTL4 0 +#define META_IDX_MYMSTR_TTL8 1 +#define META_IDX_MYMSTR_TYPE_ENC_LRU 2 // 4Bbit type, 4bit encoding, 24bits lru +#define META_IDX_MYMSTR_VALUE_PTR 3 +#define META_IDX_MYMSTR_FLAG_NO_META 4 + +#define TEST_CONTEXT(context) printf("\nContext: %s \n", context); + +int mstrTest(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + struct mstrKind kind_mymstr = { + .name = "my_mstr", + .metaSize[META_IDX_MYMSTR_TTL4] = 4, + .metaSize[META_IDX_MYMSTR_TTL8] = 8, + .metaSize[META_IDX_MYMSTR_TYPE_ENC_LRU] = 4, + .metaSize[META_IDX_MYMSTR_VALUE_PTR] = 8, + .metaSize[META_IDX_MYMSTR_FLAG_NO_META] = 0, + }; + + TEST_CONTEXT("Create simple short mstr") + { + char *str = "foo"; + mstr s = mstrNew(str, strlen(str), 0); + size_t expStrLen = strlen(str); + + test_cond("Verify str length and alloc length", + mstrAllocLen(s, NULL) == (1 + expStrLen + 1) && /* mstrhdr5 + str + null */ + mstrlen(s) == expStrLen && /* expected strlen(str) */ + memcmp(s, str, expStrLen + 1) == 0); + mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Create simple 40 
bytes mstr") + { + char *str = "0123456789012345678901234567890123456789"; // 40 bytes + mstr s = mstrNew(str, strlen(str), 0); + + test_cond("Verify str length and alloc length", + mstrAllocLen(s, NULL) == (3 + 40 + 1) && /* mstrhdr8 + str + null */ + mstrlen(s) == 40 && + memcmp(s,str,40) == 0); + mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Create mstr with random characters") + { + long unsigned int i; + char str[66000]; + for (i = 0 ; i < sizeof(str) ; ++i) str[i] = rand() % 256; + + size_t len[] = { 31, 32, 33, 255, 256, 257, 65535, 65536, 65537, 66000}; + for (i = 0 ; i < sizeof(len) / sizeof(len[0]) ; ++i) { + char title[100]; + mstr s = mstrNew(str, len[i], 0); + size_t mstrhdrSize = (len[i] < 1<<5) ? sizeof(struct mstrhdr5) : + (len[i] < 1<<8) ? sizeof(struct mstrhdr8) : + (len[i] < 1<<16) ? sizeof(struct mstrhdr16) : + sizeof(struct mstrhdr64); + + snprintf(title, sizeof(title), "Verify string of length %zu", len[i]); + test_cond(title, + mstrAllocLen(s, NULL) == (mstrhdrSize + len[i] + 1) && /* mstrhdrX + str + null */ + mstrlen(s) == len[i] && + memcmp(s,str,len[i]) == 0); + mstrFree(&kind_mymstr, s); + } + } + + TEST_CONTEXT("Create short mstr with TTL4") + { + uint32_t *ttl; + mstr s = mstrNewWithMeta(&kind_mymstr, + "foo", + strlen("foo"), + B(META_IDX_MYMSTR_TTL4), /* allocate with TTL4 metadata */ + 0); + + ttl = mstrMetaRef(s, &kind_mymstr, META_IDX_MYMSTR_TTL4); + *ttl = 0x12345678; + + test_cond("Verify memory-allocation and string lengths", + mstrAllocLen(s, &kind_mymstr) == (1 + 3 + 2 + 1 + 4) && /* mstrhdr5 + str + null + mstrFlags + TLL */ + mstrlen(s) == 3); + + unsigned char expMem[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x00, 0x1c, 'f', 'o', 'o', '\0' }; + uint32_t value = 0x12345678; + memcpy(expMem, &value, sizeof(uint32_t)); + test_cond("Verify string and TTL4 payload", memcmp( + mstrMetaRef(s, &kind_mymstr, 0) , expMem, sizeof(expMem)) == 0); + + test_cond("Verify mstrIsMetaAttached() function works", mstrIsMetaAttached(s) != 0); + + 
mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Create short mstr with TTL4 and value ptr ") + { + mstr s = mstrNewWithMeta(&kind_mymstr, "foo", strlen("foo"), + B(META_IDX_MYMSTR_TTL4) | B(META_IDX_MYMSTR_VALUE_PTR), 0); + *((uint32_t *) (mstrMetaRef(s, &kind_mymstr, + META_IDX_MYMSTR_TTL4))) = 0x12345678; + + test_cond("Verify length and alloc length", + mstrAllocLen(s, &kind_mymstr) == (1 + 3 + 1 + 2 + 4 + 8) && /* mstrhdr5 + str + null + mstrFlags + TLL + PTR */ + mstrlen(s) == 3); + mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Copy mstr and add it TTL4") + { + mstr s1 = mstrNew("foo", strlen("foo"), 0); + mstr s2 = mstrNewCopy(&kind_mymstr, s1, B(META_IDX_MYMSTR_TTL4)); + *((uint32_t *) (mstrMetaRef(s2, &kind_mymstr, META_IDX_MYMSTR_TTL4))) = 0x12345678; + + test_cond("Verify new mstr includes TTL4", + mstrAllocLen(s2, &kind_mymstr) == (1 + 3 + 1 + 2 + 4) && /* mstrhdr5 + str + null + mstrFlags + TTL4 */ + mstrlen(s2) == 3 && /* 'foo' = 3bytes */ + memcmp(s2, "foo\0", 4) == 0); + + mstr s3 = mstrNewCopy(&kind_mymstr, s2, B(META_IDX_MYMSTR_TTL4)); + unsigned char expMem[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0x1, 0x0, 0x1c, 'f', 'o', 'o', '\0' }; + uint32_t value = 0x12345678; + memcpy(expMem, &value, sizeof(uint32_t)); + + char *ppp = mstrGetStartAlloc(s3, &kind_mymstr); + test_cond("Verify string and TTL4 payload", + memcmp(ppp, expMem, sizeof(expMem)) == 0); + + mstrPrint(s3, &kind_mymstr, 1); + mstrFree(&kind_mymstr, s1); + mstrFree(&kind_mymstr, s2); + mstrFree(&kind_mymstr, s3); + } + + return 0; +} +#endif diff --git a/src/mstr.h b/src/mstr.h new file mode 100644 index 000000000..fa7d4b214 --- /dev/null +++ b/src/mstr.h @@ -0,0 +1,223 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). + * + * + * WHAT IS MSTR (M-STRING)? + * ------------------------ + * mstr stands for immutable string with optional metadata attached. 
+ * + * sds string is widely used across the system and serves as a general-purpose + * container to hold data. The need to optimize memory and aggregate strings + * along with metadata and store them in Redis data-structures as a single bulk keeps + * recurring. One thought might be: why not extend sds to support metadata? The + * answer is that sds is a mutable string by nature, with a wide API (split, join, + * etc.). Pushing metadata logic into sds would make it very fragile and complex + * to maintain. + * + * Another idea involved using a simple struct with flags and a dynamic buf[] at the + * end. While this could be viable, it introduces considerable complexity and would + * need maintenance across different contexts. + * + * As an alternative, we introduce a new implementation of immutable strings, + * with a limited API, and with the option to attach metadata. The representation + * of the string, without any metadata, in its basic form, resembles SDS but + * without the API to manipulate the string; the only extra capability is attaching metadata to it. The + * following diagram shows the memory layout of mstring (mstrhdr8) when no + * metadata is attached: + * + * +----------------------------------------------+ + * | mstrhdr8 | c-string | | + * +--------------------------------+-------------+ + * |8b |2b |1b |5b |?bytes |8b| + * | Len | Type |m-bit=0 | Unused | String |\0| + * +----------------------------------------------+ + * ^ + * | + * mstrNew() returns pointer to here --+ + * + * If the metadata flag is set (depicted in the diagram above as m-bit), + * then the header will be preceded by an additional 16 bits of metadata flags such + * that if the i'th bit is set, then the i'th metadata structure is attached to the + * mstring. The metadata layout and sizes are defined by the mstrKind structure + * (more below). 
+ * + * The following diagram shows the memory layout of mstr (mstrhdr8) when 3 bits in mFlags + * are set to indicate that 3 fields of metadata are attached to the mstring at the + * beginning. + * + * +-------------------------------------------------------------------------------+ + * | METADATA FIELDS | mflags | mstrhdr8 | c-string | | + * +-----------------------+--------+--------------------------------+-------------+ + * |?bytes |?bytes |?bytes |16b |8b |2b |1b |5b |?bytes |8b| + * | Meta3 | Meta2 | Meta0 | 0b1101 | Len | Type |m-bit=1 | Unused | String |\0| + * +-------------------------------------------------------------------------------+ + * ^ + * | + * mstrNewWithMeta() returns pointer to here --+ + * + * mstr allows defining different kinds (groups) of mstrings, each with its + * own unique metadata layout. For example, in the case of hash-fields, every instance + * can optionally have TTL metadata attached to it. This is achieved by first + * prototyping a single mstrKind structure that defines the metadata layout and sizes + * of this specific kind. Each hash-field instance still has the freedom to + * attach or not attach the metadata to it, and the metadata flags (mFlags) of the + * instance will reflect this decision. + * + * In the future, the keys of the Redis keyspace can be another kind of mstring that + * has TTL, LRU or even dictEntry metadata embedded into it. Unlike a vptr in C++, the mstrKind + * struct won't be attached to the mstring but will be passed as yet another argument + * to the API, to save memory. In addition, each instance of a given mstrKind can hold + * any subset of metadata and the 16 bits of metadata flags will reflect it. + * + * The following example shows how to define an mstrKind for a possible future keyspace + * that aggregates several keyspace-related metadata into one compact, singly + * allocated, mstring. 
+ * + * typedef enum HkeyMetaFlags { + * HKEY_META_VAL_REF_COUNT = 0, // refcount + * HKEY_META_VAL_REF = 1, // Val referenced + * HKEY_META_EXPIRE = 2, // TTL and more + * HKEY_META_TYPE_ENC_LRU = 3, // TYPE + LRU + ENC + * HKEY_META_DICT_ENT_NEXT = 4, // Next dict entry + * // Following two must be together and in this order + * HKEY_META_VAL_EMBED8 = 5, // Val embedded, max 7 bytes + * HKEY_META_VAL_EMBED16 = 6, // Val embedded, max 15 bytes (23 with EMBED8) + * } HkeyMetaFlags; + * + * mstrKind hkeyKind = { + * .name = "hkey", + * .metaSize[HKEY_META_VAL_REF_COUNT] = 4, + * .metaSize[HKEY_META_VAL_REF] = 8, + * .metaSize[HKEY_META_EXPIRE] = sizeof(ExpireMeta), + * .metaSize[HKEY_META_TYPE_ENC_LRU] = 8, + * .metaSize[HKEY_META_DICT_ENT_NEXT] = 8, + * .metaSize[HKEY_META_VAL_EMBED8] = 8, + * .metaSize[HKEY_META_VAL_EMBED16] = 16, + * }; + * + * MSTR-ALIGNMENT + * -------------- + * There are two types of alignment to take into consideration: + * 1. Alignment of the metadata. + * 2. Alignment of the returned mstr pointer. + * + * 1) As the metadata layout in memory is the reverse of the enumeration order, it is recommended + * to put metadata with "better" alignment first in memory layout (enumerated + * last), while those with the worst alignment, or those that simply don't require any alignment, go + * last in memory layout (enumerated first). This is similar to the consideration + * applied when defining a new struct in C. Note also that each metadata + * might either be attached to mstr or not, which complicates the design phase + * of a new mstrKind a little. + * + * In the example above, HKEY_META_VAL_REF_COUNT, with the worst alignment of 4 + * bytes, is enumerated first, and therefore will be last in memory layout. + * + * 2) A few optimizations in Redis rely on the fact that an sds address is always an odd + * pointer. We can achieve the same with a little effort. Care was already + * taken that all headers of type mstrhdrX have an odd size. 
With that in mind, if + * a new kind of mstr is required to be limited to odd addresses, then we must + * make sure that sizes of all related metadatas that are defined in mstrKind + * are even in size. + */ + +#ifndef __MSTR_H +#define __MSTR_H + +#include +#include +#include + +/* Selective copy of ifndef from server.h instead of including it */ +#ifndef static_assert +#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 1:-1] +#endif + +#define MSTR_TYPE_5 0 +#define MSTR_TYPE_8 1 +#define MSTR_TYPE_16 2 +#define MSTR_TYPE_64 3 +#define MSTR_TYPE_MASK 3 +#define MSTR_TYPE_BITS 2 + +#define MSTR_META_MASK 4 + +#define MSTR_HDR(T,s) ((struct mstrhdr##T *)((s)-(sizeof(struct mstrhdr##T)))) +#define MSTR_HDR_VAR(T,s) struct mstrhdr##T *sh = (void*)((s)-(sizeof(struct mstrhdr##T))); + +#define MSTR_META_BITS 1 /* is metadata attached? */ +#define MSTR_TYPE_5_LEN(f) ((f) >> (MSTR_TYPE_BITS + MSTR_META_BITS)) +#define CREATE_MSTR_INFO(len, ismeta, type) ( (((len<ptr * for a string object. This includes internal fragmentation. */ size_t getStringObjectSdsUsedMemory(robj *o) { diff --git a/src/notify.c b/src/notify.c index 16b2b04ef..237716699 100644 --- a/src/notify.c +++ b/src/notify.c @@ -80,7 +80,7 @@ sds keyspaceEventsFlagsToString(int flags) { * 'event' is a C string representing the event name. * 'key' is a Redis object representing the key name. * 'dbid' is the database ID where the key lives. */ -void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid) { +void notifyKeyspaceEvent(int type, const char *event, robj *key, int dbid) { sds chan; robj *chanobj, *eventobj; int len = -1; diff --git a/src/object.c b/src/object.c index b4c8383e3..c368dd0df 100644 --- a/src/object.c +++ b/src/object.c @@ -979,7 +979,6 @@ size_t streamRadixTreeMemoryUsage(rax *rax) { * are checked and averaged to estimate the total size. */ #define OBJ_COMPUTE_SIZE_DEF_SAMPLES 5 /* Default sample size. 
*/ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { - sds ele, ele2; dict *d; dictIterator *di; struct dictEntry *de; @@ -1016,7 +1015,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { di = dictGetIterator(d); asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictBuckets(d)); while((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); + sds ele = dictGetKey(de); elesize += dictEntryMemUsage() + sdsZmallocSize(ele); samples++; } @@ -1057,9 +1056,9 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { di = dictGetIterator(d); asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictBuckets(d)); while((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - ele2 = dictGetVal(de); - elesize += sdsZmallocSize(ele) + sdsZmallocSize(ele2); + hfield ele = dictGetKey(de); + sds ele2 = dictGetVal(de); + elesize += hfieldZmallocSize(ele) + sdsZmallocSize(ele2); elesize += dictEntryMemUsage(); samples++; } diff --git a/src/rax.c b/src/rax.c index afd4dfe58..491e50aa0 100644 --- a/src/rax.c +++ b/src/rax.c @@ -173,11 +173,16 @@ raxNode *raxNewNode(size_t children, int datafield) { /* Allocate a new rax and return its pointer. On out of memory the function * returns NULL. 
*/ rax *raxNew(void) { - rax *rax = rax_malloc(sizeof(*rax)); + return raxNewWithMetadata(0); +} + +/* Allocate a new rax with metadata */ +rax *raxNewWithMetadata(int metaSize) { + rax *rax = rax_malloc(sizeof(*rax) + metaSize); if (rax == NULL) return NULL; rax->numele = 0; rax->numnodes = 1; - rax->head = raxNewNode(0,0); + rax->head = raxNewNode(0, 0); if (rax->head == NULL) { rax_free(rax); return NULL; @@ -1210,6 +1215,25 @@ void raxRecursiveFree(rax *rax, raxNode *n, void (*free_callback)(void*)) { rax->numnodes--; } +/* Same as raxRecursiveFree() with context argument */ +void raxRecursiveFreeWithCtx(rax *rax, raxNode *n, + void (*free_callback)(void *item, void *ctx), void *ctx) { + debugnode("free traversing",n); + int numchildren = n->iscompr ? 1 : n->size; + raxNode **cp = raxNodeLastChildPtr(n); + while(numchildren--) { + raxNode *child; + memcpy(&child,cp,sizeof(child)); + raxRecursiveFreeWithCtx(rax,child,free_callback, ctx); + cp--; + } + debugnode("free depth-first",n); + if (free_callback && n->iskey && !n->isnull) + free_callback(raxGetData(n), ctx); + rax_free(n); + rax->numnodes--; +} + /* Free a whole radix tree, calling the specified callback in order to * free the auxiliary data. */ void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)) { @@ -1218,6 +1242,15 @@ void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)) { rax_free(rax); } +/* Free a whole radix tree, calling the specified callback in order to + * free the auxiliary data. */ +void raxFreeWithCbAndContext(rax *rax, + void (*free_callback)(void *item, void *ctx), void *ctx) { + raxRecursiveFreeWithCtx(rax,rax->head,free_callback,ctx); + assert(rax->numnodes == 0); + rax_free(rax); +} + /* Free a whole radix tree. 
*/ void raxFree(rax *rax) { raxFreeWithCallback(rax,NULL); diff --git a/src/rax.h b/src/rax.h index c3d182a2b..74963acad 100644 --- a/src/rax.h +++ b/src/rax.h @@ -113,6 +113,7 @@ typedef struct rax { raxNode *head; uint64_t numele; uint64_t numnodes; + void *metadata[]; } rax; /* Stack data structure used by raxLowWalk() in order to, optionally, return @@ -166,12 +167,16 @@ typedef struct raxIterator { /* Exported API. */ rax *raxNew(void); +rax *raxNewWithMetadata(int metaSize); int raxInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxRemove(rax *rax, unsigned char *s, size_t len, void **old); int raxFind(rax *rax, unsigned char *s, size_t len, void **value); void raxFree(rax *rax); void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)); +void raxFreeWithCbAndContext(rax *rax, + void (*free_callback)(void *item, void *ctx), + void *ctx); void raxStart(raxIterator *it, rax *rt); int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len); int raxNext(raxIterator *it); diff --git a/src/rdb.c b/src/rdb.c index 4ed3726b2..f190538ad 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -268,8 +268,9 @@ int rdbEncodeInteger(long long value, unsigned char *enc) { * The returned value changes according to the flags, see * rdbGenericLoadStringObject() for more info. */ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { - int plain = flags & RDB_LOAD_PLAIN; - int sds = flags & RDB_LOAD_SDS; + int plainFlag = flags & RDB_LOAD_PLAIN; + int sdsFlag = flags & RDB_LOAD_SDS; + int hfldFlag = flags & RDB_LOAD_HFLD; int encode = flags & RDB_LOAD_ENC; unsigned char enc[4]; long long val; @@ -295,11 +296,17 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { rdbReportCorruptRDB("Unknown RDB integer encoding type %d",enctype); return NULL; /* Never reached. 
*/ } - if (plain || sds) { + if (plainFlag || sdsFlag || hfldFlag) { char buf[LONG_STR_SIZE], *p; int len = ll2string(buf,sizeof(buf),val); if (lenptr) *lenptr = len; - p = plain ? zmalloc(len) : sdsnewlen(SDS_NOINIT,len); + if (plainFlag) { + p = zmalloc(len); + } else if (sdsFlag) { + p = sdsnewlen(SDS_NOINIT,len); + } else { /* hfldFlag */ + p = hfieldNew(NULL, len, 0); + } memcpy(p,buf,len); return p; } else if (encode) { @@ -368,8 +375,11 @@ ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { * changes according to 'flags'. For more info check the * rdbGenericLoadStringObject() function. */ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { - int plain = flags & RDB_LOAD_PLAIN; - int sds = flags & RDB_LOAD_SDS; + int plainFlag = flags & RDB_LOAD_PLAIN; + int sdsFlag = flags & RDB_LOAD_SDS; + int hfldFlag = flags & RDB_LOAD_HFLD; + int robjFlag = (!(plainFlag || sdsFlag || hfldFlag)); /* not plain/sds/hfld */ + uint64_t len, clen; unsigned char *c = NULL; char *val = NULL; @@ -382,11 +392,14 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { } /* Allocate our target according to the uncompressed size. */ - if (plain) { + if (plainFlag) { val = ztrymalloc(len); - } else { + } else if (sdsFlag || robjFlag) { val = sdstrynewlen(SDS_NOINIT,len); + } else { /* hfldFlag */ + val = hfieldTryNew(NULL, len, 0); } + if (!val) { serverLog(isRestoreContext()? LL_VERBOSE: LL_WARNING, "rdbLoadLzfStringObject failed allocating %llu bytes", (unsigned long long)len); goto err; @@ -402,17 +415,17 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { } zfree(c); - if (plain || sds) { - return val; - } else { - return createObject(OBJ_STRING,val); - } + return (robjFlag) ? 
createObject(OBJ_STRING,val) : (void *) val; + err: zfree(c); - if (plain) + if (plainFlag) { zfree(val); - else + } else if (sdsFlag || robjFlag) { sdsfree(val); + } else { /* hfldFlag*/ + hfieldFree(val); + } return NULL; } @@ -495,8 +508,12 @@ ssize_t rdbSaveStringObject(rio *rdb, robj *obj) { * On I/O error NULL is returned. */ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { - int plain = flags & RDB_LOAD_PLAIN; - int sds = flags & RDB_LOAD_SDS; + void *buf; + int plainFlag = flags & RDB_LOAD_PLAIN; + int sdsFlag = flags & RDB_LOAD_SDS; + int hfldFlag = flags & RDB_LOAD_HFLD; + int robjFlag = (!(plainFlag || sdsFlag || hfldFlag)); /* not plain/sds/hfld */ + int isencoded; unsigned long long len; @@ -517,22 +534,8 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { } } - if (plain || sds) { - void *buf = plain ? ztrymalloc(len) : sdstrynewlen(SDS_NOINIT,len); - if (!buf) { - serverLog(isRestoreContext()? LL_VERBOSE: LL_WARNING, "rdbGenericLoadStringObject failed allocating %llu bytes", len); - return NULL; - } - if (lenptr) *lenptr = len; - if (len && rioRead(rdb,buf,len) == 0) { - if (plain) - zfree(buf); - else - sdsfree(buf); - return NULL; - } - return buf; - } else { + /* return robj */ + if (robjFlag) { robj *o = tryCreateStringObject(SDS_NOINIT,len); if (!o) { serverLog(isRestoreContext()? LL_VERBOSE: LL_WARNING, "rdbGenericLoadStringObject failed allocating %llu bytes", len); @@ -544,6 +547,32 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { } return o; } + + /* plain/sds/hfld */ + if (plainFlag) { + buf = ztrymalloc(len); + } else if (sdsFlag) { + buf = sdstrynewlen(SDS_NOINIT,len); + } else { /* hfldFlag */ + buf = hfieldTryNew(NULL, len, 0); + } + if (!buf) { + serverLog(isRestoreContext()? 
LL_VERBOSE: LL_WARNING, "rdbGenericLoadStringObject failed allocating %llu bytes", len); + return NULL; + } + + if (lenptr) *lenptr = len; + if (len && rioRead(rdb,buf,len) == 0) { + if (plainFlag) + zfree(buf); + else if (sdsFlag) { + sdsfree(buf); + } else { /* hfldFlag */ + hfieldFree(buf); + } + return NULL; + } + return buf; } robj *rdbLoadStringObject(rio *rdb) { @@ -924,11 +953,11 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { nwritten += n; while((de = dictNext(di)) != NULL) { - sds field = dictGetKey(de); + hfield field = dictGetKey(de); sds value = dictGetVal(de); if ((n = rdbSaveRawString(rdb,(unsigned char*)field, - sdslen(field))) == -1) + hfieldlen(field))) == -1) { dictReleaseIterator(di); return -1; @@ -1403,7 +1432,7 @@ werr: return C_ERR; } -/* This helper function is only used for diskless replication. +/* This helper function is only used for diskless replication. * This is just a wrapper to rdbSaveRio() that additionally adds a prefix * and a suffix to the generated RDB dump. 
The prefix is: * @@ -1856,7 +1885,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(ele); } - listTypeTryConversion(o,LIST_CONV_AUTO,NULL,NULL); + listTypeTryConversion(o, LIST_CONV_AUTO, NULL, NULL); } else if (rdbtype == RDB_TYPE_SET) { /* Read Set value */ if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; @@ -1869,7 +1898,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr,len) != DICT_OK) { + if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) { rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; @@ -1896,7 +1925,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Fetch integer value from element. */ if (isSdsRepresentableAsLongLong(sdsele,&llval) == C_OK) { uint8_t success; - o->ptr = intsetAdd(o->ptr,llval,&success); + o->ptr = intsetAdd(o->ptr, llval, &success); if (!success) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); @@ -1946,7 +1975,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* This will also be called when the set was just converted * to a regular hash table encoded set. 
*/ if (o->encoding == OBJ_ENCODING_HT) { - if (dictAdd((dict*)o->ptr,sdsele,NULL) != DICT_OK) { + if (dictAdd((dict*)o->ptr, sdsele, NULL) != DICT_OK) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); sdsfree(sdsele); @@ -2024,12 +2053,13 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { maxelelen <= server.zset_max_listpack_value && lpSafeToAdd(NULL, totelelen)) { - zsetConvert(o,OBJ_ENCODING_LISTPACK); + zsetConvert(o, OBJ_ENCODING_LISTPACK); } } else if (rdbtype == RDB_TYPE_HASH) { uint64_t len; int ret; - sds field, value; + sds value; + hfield field; dict *dupSearchDict = NULL; len = rdbLoadLen(rdb, NULL); @@ -2054,43 +2084,46 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { while (o->encoding == OBJ_ENCODING_LISTPACK && len > 0) { len--; /* Load raw strings */ - if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { + if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_HFLD,NULL)) == NULL) { decrRefCount(o); if (dupSearchDict) dictRelease(dupSearchDict); return NULL; } if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { - sdsfree(field); + hfieldFree(field); decrRefCount(o); if (dupSearchDict) dictRelease(dupSearchDict); return NULL; } if (dupSearchDict) { - sds field_dup = sdsdup(field); + sds field_dup = sdsnewlen(field, hfieldlen(field)); + if (dictAdd(dupSearchDict, field_dup, NULL) != DICT_OK) { rdbReportCorruptRDB("Hash with dup elements"); dictRelease(dupSearchDict); decrRefCount(o); sdsfree(field_dup); - sdsfree(field); + hfieldFree(field); sdsfree(value); return NULL; } } /* Convert to hash table if size threshold is exceeded */ - if (sdslen(field) > server.hash_max_listpack_value || + if (hfieldlen(field) > server.hash_max_listpack_value || sdslen(value) > server.hash_max_listpack_value || - !lpSafeToAdd(o->ptr, sdslen(field)+sdslen(value))) + !lpSafeToAdd(o->ptr, hfieldlen(field) + sdslen(value))) { hashTypeConvert(o, 
OBJ_ENCODING_HT); + dictUseStoredKeyApi((dict *)o->ptr, 1); ret = dictAdd((dict*)o->ptr, field, value); + dictUseStoredKeyApi((dict *)o->ptr, 0); if (ret == DICT_ERR) { rdbReportCorruptRDB("Duplicate hash fields detected"); if (dupSearchDict) dictRelease(dupSearchDict); sdsfree(value); - sdsfree(field); + hfieldFree(field); decrRefCount(o); return NULL; } @@ -2098,10 +2131,10 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } /* Add pair to listpack */ - o->ptr = lpAppend(o->ptr, (unsigned char*)field, sdslen(field)); + o->ptr = lpAppend(o->ptr, (unsigned char*)field, hfieldlen(field)); o->ptr = lpAppend(o->ptr, (unsigned char*)value, sdslen(value)); - sdsfree(field); + hfieldFree(field); sdsfree(value); } @@ -2113,7 +2146,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } if (o->encoding == OBJ_ENCODING_HT && len > DICT_HT_INITIAL_SIZE) { - if (dictTryExpand(o->ptr,len) != DICT_OK) { + if (dictTryExpand(o->ptr, len) != DICT_OK) { rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; @@ -2124,22 +2157,25 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { while (o->encoding == OBJ_ENCODING_HT && len > 0) { len--; /* Load encoded strings */ - if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { + if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_HFLD,NULL)) == NULL) { decrRefCount(o); return NULL; } if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { - sdsfree(field); + hfieldFree(field); decrRefCount(o); return NULL; } /* Add pair to hash table */ - ret = dictAdd((dict*)o->ptr, field, value); + dict *d = o->ptr; + dictUseStoredKeyApi(d, 1); + ret = dictAdd(d, field, value); + dictUseStoredKeyApi(d, 0); if (ret == DICT_ERR) { rdbReportCorruptRDB("Duplicate hash fields detected"); sdsfree(value); - sdsfree(field); + hfieldFree(field); decrRefCount(o); return NULL; } @@ -2221,7 
+2257,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { goto emptykey; } - listTypeTryConversion(o,LIST_CONV_AUTO,NULL,NULL); + listTypeTryConversion(o, LIST_CONV_AUTO, NULL, NULL); } else if (rdbtype == RDB_TYPE_HASH_ZIPMAP || rdbtype == RDB_TYPE_LIST_ZIPLIST || rdbtype == RDB_TYPE_SET_INTSET || @@ -2236,7 +2272,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,&encoded_len); if (encoded == NULL) return NULL; - o = createObject(OBJ_STRING,encoded); /* Obj type fixed below. */ + o = createObject(OBJ_STRING, encoded); /* Obj type fixed below. */ /* Fix the object encoding, and make sure to convert the encoded * data type into the base type if accordingly to the current @@ -2292,14 +2328,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_LISTPACK; - if (hashTypeLength(o) > server.hash_max_listpack_entries || + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries || maxlen > server.hash_max_listpack_value) { hashTypeConvert(o, OBJ_ENCODING_HT); } } break; - case RDB_TYPE_LIST_ZIPLIST: + case RDB_TYPE_LIST_ZIPLIST: { quicklist *ql = quicklistNew(server.list_max_listpack_size, server.list_compress_depth); @@ -2341,7 +2377,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; if (intsetLen(o->ptr) > server.set_max_intset_entries) - setTypeConvert(o,OBJ_ENCODING_HT); + setTypeConvert(o, OBJ_ENCODING_HT); break; case RDB_TYPE_SET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; @@ -2386,7 +2422,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } if (zsetLength(o) > server.zset_max_listpack_entries) - zsetConvert(o,OBJ_ENCODING_SKIPLIST); + zsetConvert(o, OBJ_ENCODING_SKIPLIST); else o->ptr = lpShrinkToFit(o->ptr); break; @@ -2408,7 +2444,7 
@@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } if (zsetLength(o) > server.zset_max_listpack_entries) - zsetConvert(o,OBJ_ENCODING_SKIPLIST); + zsetConvert(o, OBJ_ENCODING_SKIPLIST); break; case RDB_TYPE_HASH_ZIPLIST: { @@ -2426,12 +2462,12 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->ptr = lp; o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_LISTPACK; - if (hashTypeLength(o) == 0) { + if (hashTypeLength(o, 0) == 0) { decrRefCount(o); goto emptykey; } - if (hashTypeLength(o) > server.hash_max_listpack_entries) + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HT); else o->ptr = lpShrinkToFit(o->ptr); @@ -2448,12 +2484,12 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_LISTPACK; - if (hashTypeLength(o) == 0) { + if (hashTypeLength(o, 0) == 0) { decrRefCount(o); goto emptykey; } - if (hashTypeLength(o) > server.hash_max_listpack_entries) + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HT); break; default: @@ -2540,7 +2576,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Load the last entry ID. */ s->last_id.ms = rdbLoadLen(rdb,NULL); s->last_id.seq = rdbLoadLen(rdb,NULL); - + if (rdbtype >= RDB_TYPE_STREAM_LISTPACKS_2) { /* Load the first entry ID. */ s->first_id.ms = rdbLoadLen(rdb,NULL); @@ -2559,9 +2595,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { s->max_deleted_entry_id.ms = 0; s->max_deleted_entry_id.seq = 0; s->entries_added = s->length; - + /* Since the rax is already loaded, we can find the first entry's - * ID. */ + * ID. 
*/ streamGetEdgeID(s,1,1,&s->first_id); } @@ -2807,7 +2843,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { uint64_t eof = rdbLoadLen(rdb,NULL); if (eof == RDB_LENERR) { if (ptr) { - o = createModuleObject(mt,ptr); /* creating just in order to easily destroy */ + o = createModuleObject(mt, ptr); /* creating just in order to easily destroy */ decrRefCount(o); } return NULL; @@ -2816,7 +2852,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { rdbReportCorruptRDB("The RDB file contains module data for the module '%s' that is not terminated by " "the proper module value EOF marker", moduleTypeModuleName(mt)); if (ptr) { - o = createModuleObject(mt,ptr); /* creating just in order to easily destroy */ + o = createModuleObject(mt, ptr); /* creating just in order to easily destroy */ decrRefCount(o); } return NULL; @@ -2828,7 +2864,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { moduleTypeModuleName(mt)); return NULL; } - o = createModuleObject(mt,ptr); + o = createModuleObject(mt, ptr); } else { rdbReportReadError("Unknown RDB encoding type %d",rdbtype); return NULL; @@ -3256,8 +3292,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin * received from the master. In the latter case, the master is * responsible for key expiry. If we would expire keys here, the * snapshot taken by the master may not be reflected on the slave. - * Similarly, if the base AOF is RDB format, we want to load all - * the keys they are, since the log of operations in the incr AOF + * Similarly, if the base AOF is RDB format, we want to load all + * the keys they are, since the log of operations in the incr AOF * is assumed to work in the exact keyspace state. 
*/ if (val == NULL) { /* Since we used to have bug that could lead to empty keys diff --git a/src/rdb.h b/src/rdb.h index 8ce2aaaf7..02bb5e347 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -105,6 +105,7 @@ #define RDB_LOAD_ENC (1<<0) #define RDB_LOAD_PLAIN (1<<1) #define RDB_LOAD_SDS (1<<2) +#define RDB_LOAD_HFLD (1<<3) /* flags on the purpose of rdb save or load */ #define RDBFLAGS_NONE 0 /* No special RDB loading or saving. */ diff --git a/src/server.c b/src/server.c index 515105c3f..e0366fcea 100644 --- a/src/server.c +++ b/src/server.c @@ -19,6 +19,8 @@ #include "syscheck.h" #include "threads_mngr.h" #include "fmtargs.h" +#include "mstr.h" +#include "ebuckets.h" #include #include @@ -281,6 +283,18 @@ int dictSdsKeyCompare(dict *d, const void *key1, return memcmp(key1, key2, l1) == 0; } +int dictSdsMstrKeyCompare(dict *d, const void *sdsLookup, const void *mstrStored) +{ + int l1,l2; + UNUSED(d); + + l1 = sdslen((sds)sdsLookup); + l2 = hfieldlen((hfield)mstrStored); + if (l1 != l2) return 0; + return memcmp(sdsLookup, mstrStored, l1) == 0; +} + + /* A case insensitive version used for the command lookup table and other * places where case insensitive non binary-safe comparison is needed. 
*/ int dictSdsKeyCaseCompare(dict *d, const void *key1, @@ -2500,6 +2514,7 @@ void resetServerStats(void) { server.stat_numcommands = 0; server.stat_numconnections = 0; server.stat_expiredkeys = 0; + server.stat_expired_hash_fields = 0; server.stat_expired_stale_perc = 0; server.stat_expired_time_cap_reached_count = 0; server.stat_expire_cycle_time_used = 0; @@ -2648,6 +2663,7 @@ void initServer(void) { for (j = 0; j < server.dbnum; j++) { server.db[j].keys = kvstoreCreate(&dbDictType, slot_count_bits, flags); server.db[j].expires = kvstoreCreate(&dbExpiresDictType, slot_count_bits, flags); + server.db[j].hexpires = ebCreate(); server.db[j].expires_cursor = 0; server.db[j].blocking_keys = dictCreate(&keylistDictType); server.db[j].blocking_keys_unblock_on_nokey = dictCreate(&objectKeyPointerValueDictType); @@ -5849,6 +5865,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { "sync_full:%lld\r\n", server.stat_sync_full, "sync_partial_ok:%lld\r\n", server.stat_sync_partial_ok, "sync_partial_err:%lld\r\n", server.stat_sync_partial_err, + "expired_hash_fields:%lld\r\n", server.stat_expired_hash_fields, "expired_keys:%lld\r\n", server.stat_expiredkeys, "expired_stale_perc:%.2f\r\n", server.stat_expired_stale_perc*100, "expired_time_cap_reached_count:%lld\r\n", server.stat_expired_time_cap_reached_count, @@ -6862,9 +6879,11 @@ struct redisTest { {"crc64", crc64Test}, {"zmalloc", zmalloc_test}, {"sds", sdsTest}, + {"mstr", mstrTest}, {"dict", dictTest}, {"listpack", listpackTest}, {"kvstore", kvstoreTest}, + {"ebuckets", ebucketsTest}, }; redisTestProc *getTestProcByName(const char *name) { int numtests = sizeof(redisTests)/sizeof(struct redisTest); @@ -6891,6 +6910,7 @@ int main(int argc, char **argv) { if (!strcasecmp(arg, "--accurate")) flags |= REDIS_TEST_ACCURATE; else if (!strcasecmp(arg, "--large-memory")) flags |= REDIS_TEST_LARGE_MEMORY; else if (!strcasecmp(arg, "--valgrind")) flags |= REDIS_TEST_VALGRIND; + else if 
(!strcasecmp(arg, "--verbose")) flags |= REDIS_TEST_VERBOSE; } if (!strcasecmp(argv[2], "all")) { diff --git a/src/server.h b/src/server.h index 4c10ca1a7..bca651ba5 100644 --- a/src/server.h +++ b/src/server.h @@ -45,6 +45,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "ae.h" /* Event driven programming library */ #include "sds.h" /* Dynamic safe strings */ +#include "mstr.h" /* Immutable strings with optional metadata attached */ +#include "ebuckets.h" /* expiry data structure */ #include "dict.h" /* Hash tables */ #include "kvstore.h" /* Slot-based hash table */ #include "adlist.h" /* Linked lists */ @@ -960,6 +962,7 @@ typedef struct replBufBlock { typedef struct redisDb { kvstore *keys; /* The keyspace for this DB */ kvstore *expires; /* Timeout of keys with a timeout set */ + ebuckets hexpires; /* Hash expiration DS. Single TTL per hash (of next min field to expire) */ dict *blocking_keys; /* Keys with clients waiting for data (BLPOP)*/ dict *blocking_keys_unblock_on_nokey; /* Keys with clients waiting for * data, and should be unblocked if key is deleted (XREADEDGROUP). @@ -1642,6 +1645,7 @@ struct redisServer { long long stat_numcommands; /* Number of processed commands */ long long stat_numconnections; /* Number of connections received */ long long stat_expiredkeys; /* Number of expired keys */ + long long stat_expired_hash_fields; /* Number of expired hash-fields */ double stat_expired_stale_perc; /* Percentage of keys probably expired */ long long stat_expired_time_cap_reached_count; /* Early expire cycle stops.*/ long long stat_expire_cycle_time_used; /* Cumulative microseconds used. 
*/ @@ -2444,6 +2448,10 @@ typedef struct { #define IO_THREADS_OP_WRITE 2 extern int io_threads_op; +/* Hash-field data type (of t_hash.c) */ +typedef mstr hfield; +extern mstrKind mstrFieldKind; + /*----------------------------------------------------------------------------- * Extern declarations *----------------------------------------------------------------------------*/ @@ -2458,6 +2466,8 @@ extern dictType zsetDictType; extern dictType dbDictType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; extern dictType hashDictType; +extern dictType mstrHashDictType; +extern dictType mstrHashDictTypeWithHFE; extern dictType stringSetDictType; extern dictType externalStringType; extern dictType sdsHashDictType; @@ -2469,6 +2479,9 @@ extern dictType sdsReplyDictType; extern dictType keylistDictType; extern dict *modules; +extern EbucketsType hashExpireBucketsType; /* global expires */ +extern EbucketsType hashFieldExpiresBucketType; /* local per hash */ + /*----------------------------------------------------------------------------- * Functions prototypes *----------------------------------------------------------------------------*/ @@ -2611,6 +2624,7 @@ void copyReplicaOutputBuffer(client *dst, client *src); void addListRangeReply(client *c, robj *o, long start, long end, int reverse); void deferredAfterErrorReply(client *c, list *errors); size_t sdsZmallocSize(sds s); +size_t hfieldZmallocSize(hfield s); size_t getStringObjectSdsUsedMemory(robj *o); void freeClientReplyValue(void *o); void *dupClientReplyValue(void *o); @@ -3144,21 +3158,35 @@ void hashTypeConvert(robj *o, int enc); void hashTypeTryConversion(robj *subject, robj **argv, int start, int end); int hashTypeExists(robj *o, sds key); int hashTypeDelete(robj *o, sds key); -unsigned long hashTypeLength(const robj *o); +unsigned long hashTypeLength(const robj *o, int subtractExpiredFields); hashTypeIterator *hashTypeInitIterator(robj *subject); void hashTypeReleaseIterator(hashTypeIterator *hi); -int 
hashTypeNext(hashTypeIterator *hi); +int hashTypeNext(hashTypeIterator *hi, int skipExpiredFields); void hashTypeCurrentFromListpack(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll); -sds hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what); -void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll); +void hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what, char **str, + size_t *len, uint64_t *expireTime); +void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, + unsigned int *vlen, long long *vll, uint64_t *expireTime); sds hashTypeCurrentObjectNewSds(hashTypeIterator *hi, int what); -robj *hashTypeLookupWriteOrCreate(client *c, robj *key); +hfield hashTypeCurrentObjectNewHfield(hashTypeIterator *hi); robj *hashTypeGetValueObject(robj *o, sds field); -int hashTypeSet(robj *o, sds field, sds value, int flags); -robj *hashTypeDup(robj *o); +int hashTypeSet(redisDb *db, robj *o, sds field, sds value, int flags); +robj *hashTypeDup(robj *o, sds newkey, uint64_t *minHashExpire); +uint64_t hashTypeRemoveFromExpires(ebuckets *hexpires, robj *o); +void hashTypeAddToExpires(redisDb *db, sds key, robj *hashObj, uint64_t expireTime); +int64_t hashTypeGetMinExpire(robj *keyObj); + +/* Hash-Field data type (of t_hash.c) */ +hfield hfieldNew(const void *field, size_t fieldlen, int withExpireMeta); +hfield hfieldTryNew(const void *field, size_t fieldlen, int withExpireMeta); +int hfieldIsExpireAttached(hfield field); +int hfieldIsExpired(hfield field); +static inline void hfieldFree(hfield field) { mstrFree(&mstrFieldKind, field); } +static inline void *hfieldGetAllocPtr(hfield field) { return mstrGetAllocPtr(&mstrFieldKind, field); } +static inline size_t hfieldlen(hfield field) { return mstrlen(field);} /* Pub / Sub */ int pubsubUnsubscribeAllChannels(client *c, int notify); @@ -3177,7 +3205,7 @@ dict *getClientPubSubChannels(client 
*c); dict *getClientPubSubShardChannels(client *c); /* Keyspace events notification */ -void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid); +void notifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); int keyspaceEventsStringToFlags(char *classes); sds keyspaceEventsFlagsToString(int flags); @@ -3261,6 +3289,7 @@ int keyIsExpired(redisDb *db, robj *key); long long getExpire(redisDb *db, robj *key); void setExpire(client *c, redisDb *db, robj *key, long long when); int checkAlreadyExpired(long long when); +int parseExtendedExpireArgumentsOrReply(client *c, int *flags); robj *lookupKeyRead(redisDb *db, robj *key); robj *lookupKeyWrite(redisDb *db, robj *key); robj *lookupKeyReadOrReply(client *c, robj *key, robj *reply); @@ -3279,7 +3308,7 @@ int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle, #define LOOKUP_NOEXPIRE (1<<4) /* Avoid deleting lazy expired keys. */ #define LOOKUP_NOEFFECTS (LOOKUP_NONOTIFY | LOOKUP_NOSTATS | LOOKUP_NOTOUCH | LOOKUP_NOEXPIRE) /* Avoid any effects from fetching the key */ -void dbAdd(redisDb *db, robj *key, robj *val); +dictEntry *dbAdd(redisDb *db, robj *key, robj *val); int dbAddRDBLoad(redisDb *db, sds key, robj *val); void dbReplaceValue(redisDb *db, robj *key, robj *val); @@ -3434,6 +3463,7 @@ void expireSlaveKeys(void); void rememberSlaveKeyWithExpire(redisDb *db, robj *key); void flushSlaveKeysWithExpireList(void); size_t getSlaveKeyWithExpireCount(void); +uint64_t hashTypeDbActiveExpire(redisDb *db, uint32_t maxFieldsToExpire); /* evict.c -- maxmemory handling and LRU eviction. 
*/ void evictionPoolAlloc(void); @@ -3451,6 +3481,7 @@ void startEvictionTimeProc(void); uint64_t dictSdsHash(const void *key); uint64_t dictSdsCaseHash(const void *key); int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); +int dictSdsMstrKeyCompare(dict *d, const void *sdsLookup, const void *mstrStored); int dictSdsKeyCaseCompare(dict *d, const void *key1, const void *key2); void dictSdsDestructor(dict *d, void *val); void dictListDestructor(dict *d, void *val); @@ -3606,6 +3637,15 @@ void strlenCommand(client *c); void zrankCommand(client *c); void zrevrankCommand(client *c); void hsetCommand(client *c); +void hpexpireCommand(client *c); +void hexpireCommand(client *c); +void hpexpireatCommand(client *c); +void hexpireatCommand(client *c); +void httlCommand(client *c); +void hpttlCommand(client *c); +void hexpiretimeCommand(client *c); +void hpexpiretimeCommand(client *c); +void hpersistCommand(client *c); void hsetnxCommand(client *c); void hgetCommand(client *c); void hmgetCommand(client *c); diff --git a/src/t_hash.c b/src/t_hash.c index 8c4c21b0d..65eec19b2 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -7,8 +7,208 @@ */ #include "server.h" +#include "ebuckets.h" #include +/* Threshold for HEXPIRE and HPERSIST to be considered whether it is worth to + * update the expiration time of the hash object in global HFE DS. */ +#define HASH_NEW_EXPIRE_DIFF_THRESHOLD max(4000, 1<hexpires) to register hashes that have one or more fields with time-Expiration. + * The hashes will be registered in with the expiration time of the earliest field + * in the hash. + *----------------------------------------------------------------------------*/ +EbucketsType hashExpireBucketsType = { + .onDeleteItem = NULL, + .getExpireMeta = hashGetExpireMeta, /* get ExpireMeta attached to each hash */ + .itemsAddrAreOdd = 0, /* Addresses of dict are even */ +}; + +/* dictExpireMetadata - ebuckets-type for hash fields with time-Expiration. 
ebuckets + * instance Will be attached to each hash that has at least one field with expiry + * time. */ +EbucketsType hashFieldExpireBucketsType = { + .onDeleteItem = NULL, + .getExpireMeta = hfieldGetExpireMeta, /* get ExpireMeta attached to each field */ + .itemsAddrAreOdd = 1, /* Addresses of hfield (mstr) are odd!! */ +}; + +/* Each dict of hash object that has fields with time-Expiration will have the + * following metadata attached to dict header */ +typedef struct dictExpireMetadata { + ExpireMeta expireMeta; /* embedded ExpireMeta in dict. + To be used in order to register the hash in the + global ebuckets (i.e db->hexpires) with next, + minimum, hash-field to expire */ + ebuckets hfe; /* DS of Hash Fields Expiration, associated to each hash */ + sds key; /* reference to the key, same one that stored in + db->dict. Will be used from active-expiration flow + for notification and deletion of the object, if + needed. */ +} dictExpireMetadata; + +/* ActiveExpireCtx passed to hashTypeActiveExpire() */ +typedef struct ActiveExpireCtx { + uint32_t fieldsToExpireQuota; + redisDb *db; +} ActiveExpireCtx; + +/* The implementation of hashes by dict was modified from storing fields as sds + * strings to store "mstr" (Immutable string with metadata) in order to be able to + * attach TTL (ExpireMeta) to the hash-field. This usage of mstr opens up the + * opportunity for future features to attach additional metadata by need to the + * fields. + * + * The following defines new hfield kind of mstr */ +typedef enum HfieldMetaFlags { + HFIELD_META_EXPIRE = 0, +} HfieldMetaFlags; + +mstrKind mstrFieldKind = { + .name = "hField", + + /* Taking care that all metaSize[*] values are even ensures that all + * addresses of hfield instances will be odd. 
*/ + .metaSize[HFIELD_META_EXPIRE] = sizeof(ExpireMeta), +}; +static_assert(sizeof(struct ExpireMeta ) % 2 == 0, "must be even!"); + +/* Used by hpersistCommand() */ +typedef enum SetPersistRes { + HFE_PERSIST_NO_FIELD = -2, /* No such hash-field */ + HFE_PERSIST_NO_TTL = -1, /* No TTL attached to the field */ + HFE_PERSIST_OK = 1 +} SetPersistRes; + +/* Used by hashTypeSetExpire() */ +typedef enum SetExpireTimeRes { + HFE_SET_NO_FIELD = -2, /* No such hash-field */ + HFE_SET_NO_CONDITION_MET = 0, /* Specified NX | XX | GT | LT condition not met */ + HFE_SET_OK = 1, /* Expiration time set/updated as expected */ + HFE_SET_DELETED = 2 /* Field deleted because the specified time is in the past */ +} SetExpireTimeRes; + +/* Used by httlGenericCommand() */ +typedef enum GetExpireTimeRes { + HFE_GET_NO_FIELD = -2, /* No such hash-field */ + HFE_GET_NO_TTL = -1, /* No TTL attached to the field */ +} GetExpireTimeRes; + +#define HFE_NX (1<<0) +#define HFE_XX (1<<1) +#define HFE_GT (1<<2) +#define HFE_LT (1<<3) + +static inline int isDictWithMetaHFE(dict *d) { + return d->type == &mstrHashDictTypeWithHFE; +} + +/*----------------------------------------------------------------------------- + * Accessor functions for dictType of hash + *----------------------------------------------------------------------------*/ + +static int dictHfieldKeyCompare(dict *d, const void *key1, const void *key2) +{ + int l1,l2; + UNUSED(d); + + l1 = hfieldlen((hfield)key1); + l2 = hfieldlen((hfield)key2); + if (l1 != l2) return 0; + return memcmp(key1, key2, l1) == 0; +} + +static uint64_t dictMstrHash(const void *key) { + return dictGenHashFunction((unsigned char*)key, mstrlen((char*)key)); +} + +static void dictHfieldDestructor(dict *d, void *field) { + /* If attached TTL to the field, then remove it from hash's private ebuckets. 
*/ + if (hfieldGetExpireTime(field) != EB_EXPIRE_TIME_INVALID) { + dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + ebRemove(&dictExpireMeta->hfe, &hashFieldExpireBucketsType, field); + + // TODO: Check if the field is the minimum in the hash and update the global HFE DS + } + + hfieldFree(field); +} + +static size_t hashDictWithExpireMetadataBytes(dict *d) { + UNUSED(d); + /* expireMeta of the hash, ref to ebuckets and pointer to hash's key */ + return sizeof(dictExpireMetadata); +} + +static void hashDictWithExpireOnRelease(dict *d) { + /* for sure allocated with metadata. Otherwise, this func won't be registered */ + dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + ebDestroy(&dictExpireMeta->hfe, &hashFieldExpireBucketsType, NULL); +} + /*----------------------------------------------------------------------------- * Hash type API *----------------------------------------------------------------------------*/ @@ -85,7 +285,12 @@ sds hashTypeGetFromHashTable(robj *o, sds field) { serverAssert(o->encoding == OBJ_ENCODING_HT); de = dictFind(o->ptr, field); + if (de == NULL) return NULL; + + /* Check if the field is expired */ + if (hfieldIsExpired(dictGetKey(de))) return NULL; + return dictGetVal(de); } @@ -176,7 +381,7 @@ int hashTypeExists(robj *o, sds field) { #define HASH_SET_TAKE_FIELD (1<<0) #define HASH_SET_TAKE_VALUE (1<<1) #define HASH_SET_COPY 0 -int hashTypeSet(robj *o, sds field, sds value, int flags) { +int hashTypeSet(redisDb *db, robj *o, sds field, sds value, int flags) { int update = 0; /* Check if the field is too long for listpack, and convert before adding the item. 
@@ -186,7 +391,7 @@ int hashTypeSet(robj *o, sds field, sds value, int flags) { if (sdslen(field) > server.hash_max_listpack_value || sdslen(value) > server.hash_max_listpack_value) hashTypeConvert(o, OBJ_ENCODING_HT); } - + if (o->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *zl, *fptr, *vptr; @@ -213,30 +418,36 @@ int hashTypeSet(robj *o, sds field, sds value, int flags) { o->ptr = zl; /* Check if the listpack needs to be converted to a hash table */ - if (hashTypeLength(o) > server.hash_max_listpack_entries) + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HT); } else if (o->encoding == OBJ_ENCODING_HT) { dict *ht = o->ptr; - dictEntry *de, *existing; - sds v; + dictEntry *de, *existingEntry; + sds storedValue; if (flags & HASH_SET_TAKE_VALUE) { - v = value; + storedValue = value; value = NULL; } else { - v = sdsdup(value); + storedValue = sdsdup(value); } - de = dictAddRaw(ht, field, &existing); + /* Cannot leverage HASH_SET_TAKE_FIELD since hfield is not of type sds */ + hfield newField = hfieldNew(field, sdslen(field), 0); + + /* stored key is different than lookup key */ + dictUseStoredKeyApi(ht, 1); + de = dictAddRaw(ht, newField, &existingEntry); + dictUseStoredKeyApi(ht, 0); + if (de) { - dictSetVal(ht, de, v); - if (flags & HASH_SET_TAKE_FIELD) { - field = NULL; - } else { - dictSetKey(ht, de, sdsdup(field)); - } + dictSetVal(ht, de, storedValue); } else { - sdsfree(dictGetVal(existing)); - dictSetVal(ht, existing, v); + /* If attached TTL to the old field, then remove it from hash's private ebuckets */ + hfield oldField = dictGetKey(existingEntry); + hfieldPersist(db, o, oldField); + sdsfree(dictGetVal(existingEntry)); + dictSetVal(ht, existingEntry, storedValue); update = 1; + hfieldFree(newField); } } else { serverPanic("Unknown hash encoding"); @@ -269,6 +480,7 @@ int hashTypeDelete(robj *o, sds field) { } } } else if (o->encoding == OBJ_ENCODING_HT) { + /* dictDelete() will call 
dictHfieldDestructor() */ if (dictDelete((dict*)o->ptr, field) == C_OK) { deleted = 1; } @@ -279,14 +491,27 @@ int hashTypeDelete(robj *o, sds field) { return deleted; } -/* Return the number of elements in a hash. */ -unsigned long hashTypeLength(const robj *o) { +/* Return the number of elements in a hash. + * + * Note: Might be pricy in case there are many HFEs + */ +unsigned long hashTypeLength(const robj *o, int subtractExpiredFields) { unsigned long length = ULONG_MAX; if (o->encoding == OBJ_ENCODING_LISTPACK) { length = lpLength(o->ptr) / 2; } else if (o->encoding == OBJ_ENCODING_HT) { - length = dictSize((const dict*)o->ptr); + uint64_t expiredItems = 0; + dict *d = (dict*)o->ptr; + if (subtractExpiredFields && isDictWithMetaHFE(d)) { + dictExpireMetadata *meta = (dictExpireMetadata *) dictMetadata(d); + /* If dict registered in global HFE DS */ + if (meta->expireMeta.trash == 0) + expiredItems = ebExpireDryRun(meta->hfe, + &hashFieldExpireBucketsType, + commandTimeSnapshot()); + } + length = dictSize(d) - expiredItems; } else { serverPanic("Unknown hash encoding"); } @@ -317,7 +542,7 @@ void hashTypeReleaseIterator(hashTypeIterator *hi) { /* Move to the next entry in the hash. Return C_OK when the next entry * could be found and C_ERR when the iterator reaches the end. 
*/ -int hashTypeNext(hashTypeIterator *hi) { +int hashTypeNext(hashTypeIterator *hi, int skipExpiredFields) { if (hi->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *zl; unsigned char *fptr, *vptr; @@ -326,6 +551,8 @@ int hashTypeNext(hashTypeIterator *hi) { fptr = hi->fptr; vptr = hi->vptr; + /* TODO-HFE: Handle skipExpiredFields for listpack */ + if (fptr == NULL) { /* Initialize cursor */ serverAssert(vptr == NULL); @@ -345,7 +572,12 @@ int hashTypeNext(hashTypeIterator *hi) { hi->fptr = fptr; hi->vptr = vptr; } else if (hi->encoding == OBJ_ENCODING_HT) { - if ((hi->de = dictNext(hi->di)) == NULL) return C_ERR; + while ((hi->de = dictNext(hi->di)) != NULL) { + if (skipExpiredFields && hfieldIsExpired(dictGetKey(hi->de))) + continue; + return C_OK; + } + return C_ERR; } else { serverPanic("Unknown hash encoding"); } @@ -370,15 +602,30 @@ void hashTypeCurrentFromListpack(hashTypeIterator *hi, int what, /* Get the field or value at iterator cursor, for an iterator on a hash value * encoded as a hash table. Prototype is similar to - * `hashTypeGetFromHashTable`. */ -sds hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what) { + * `hashTypeGetFromHashTable`. + * + * expireTime - If parameter is not null, then the function will return the expire + * time of the field. 
If expiry not set, return EB_EXPIRE_TIME_INVALID + */ +void hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what, char **str, size_t *len, uint64_t *expireTime) { serverAssert(hi->encoding == OBJ_ENCODING_HT); + hfield key = NULL; if (what & OBJ_HASH_KEY) { - return dictGetKey(hi->de); + key = dictGetKey(hi->de); + *str = key; + *len = hfieldlen(key); } else { - return dictGetVal(hi->de); + sds val = dictGetVal(hi->de); + *str = val; + *len = sdslen(val); } + + if (expireTime) { + if (!key) key = dictGetKey(hi->de); + *expireTime = hfieldGetExpireTime( key ); + } + } /* Higher level function of hashTypeCurrent*() that returns the hash value @@ -391,14 +638,23 @@ sds hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what) { * If *vll is populated *vstr is set to NULL, so the caller * can always check the function return by checking the return value * type checking if vstr == NULL. */ -void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll) { +void hashTypeCurrentObject(hashTypeIterator *hi, + int what, + unsigned char **vstr, + unsigned int *vlen, + long long *vll, + uint64_t *expireTime) +{ if (hi->encoding == OBJ_ENCODING_LISTPACK) { *vstr = NULL; hashTypeCurrentFromListpack(hi, what, vstr, vlen, vll); + /* TODO-HFE: Handle expireTime */ } else if (hi->encoding == OBJ_ENCODING_HT) { - sds ele = hashTypeCurrentFromHashTable(hi, what); + char *ele; + size_t eleLen; + hashTypeCurrentFromHashTable(hi, what, &ele, &eleLen, expireTime); *vstr = (unsigned char*) ele; - *vlen = sdslen(ele); + *vlen = eleLen; } else { serverPanic("Unknown hash encoding"); } @@ -411,12 +667,31 @@ sds hashTypeCurrentObjectNewSds(hashTypeIterator *hi, int what) { unsigned int vlen; long long vll; - hashTypeCurrentObject(hi,what,&vstr,&vlen,&vll); + hashTypeCurrentObject(hi,what,&vstr,&vlen,&vll, NULL); if (vstr) return sdsnewlen(vstr,vlen); return sdsfromlonglong(vll); } -robj *hashTypeLookupWriteOrCreate(client *c, robj 
*key) { +/* Return the key at the current iterator position as a new hfield string. */ +hfield hashTypeCurrentObjectNewHfield(hashTypeIterator *hi) { + char buf[LONG_STR_SIZE]; + unsigned char *vstr; + unsigned int vlen; + long long vll; + hfield hf; + + hashTypeCurrentObject(hi,OBJ_HASH_KEY,&vstr,&vlen,&vll, NULL); + + if (!vstr) { + vlen = ll2string(buf, sizeof(buf), vll); + vstr = (unsigned char *) buf; + } + + hf = hfieldNew(vstr,vlen, 0); + return hf; +} + +static robj *hashTypeLookupWriteOrCreate(client *c, robj *key) { robj *o = lookupKeyWrite(c->db,key); if (checkType(c,o,OBJ_HASH)) return NULL; @@ -440,19 +715,21 @@ void hashTypeConvertListpack(robj *o, int enc) { int ret; hi = hashTypeInitIterator(o); - dict = dictCreate(&hashDictType); + dict = dictCreate(&mstrHashDictType); /* Presize the dict to avoid rehashing */ - dictExpand(dict,hashTypeLength(o)); + /* TODO: activeExpire list pack. Should be small */ + dictExpand(dict,hashTypeLength(o, 0)); - while (hashTypeNext(hi) != C_ERR) { - sds key, value; + while (hashTypeNext(hi, 0) != C_ERR) { - key = hashTypeCurrentObjectNewSds(hi,OBJ_HASH_KEY); - value = hashTypeCurrentObjectNewSds(hi,OBJ_HASH_VALUE); + hfield key = hashTypeCurrentObjectNewHfield(hi); + sds value = hashTypeCurrentObjectNewSds(hi,OBJ_HASH_VALUE); + dictUseStoredKeyApi(dict, 1); ret = dictAdd(dict, key, value); + dictUseStoredKeyApi(dict, 0); if (ret != DICT_OK) { - sdsfree(key); sdsfree(value); /* Needed for gcc ASAN */ + hfieldFree(key); sdsfree(value); /* Needed for gcc ASAN */ hashTypeReleaseIterator(hi); /* Needed for gcc ASAN */ serverLogHexDump(LL_WARNING,"listpack with dup elements dump", o->ptr,lpBytes(o->ptr)); @@ -483,7 +760,7 @@ void hashTypeConvert(robj *o, int enc) { * has the same encoding as the original one. 
* * The resulting object always has refcount set to 1 */ -robj *hashTypeDup(robj *o) { +robj *hashTypeDup(robj *o, sds newkey, uint64_t *minHashExpire) { robj *hobj; hashTypeIterator *hi; @@ -496,22 +773,51 @@ robj *hashTypeDup(robj *o) { memcpy(new_zl, zl, sz); hobj = createObject(OBJ_HASH, new_zl); hobj->encoding = OBJ_ENCODING_LISTPACK; - } else if(o->encoding == OBJ_ENCODING_HT){ - dict *d = dictCreate(&hashDictType); + } else if(o->encoding == OBJ_ENCODING_HT) { + dictExpireMetadata *dictExpireMetaSrc, *dictExpireMetaDst = NULL; + dict *d; + + /* If dict doesn't have HFE metadata, then create a new dict without it */ + if (!isDictWithMetaHFE(o->ptr)) { + d = dictCreate(&mstrHashDictType); + } else { + /* Create a new dict with HFE metadata */ + d = dictCreate(&mstrHashDictTypeWithHFE); + dictExpireMetaSrc = (dictExpireMetadata *) dictMetadata((dict *) o->ptr); + dictExpireMetaDst = (dictExpireMetadata *) dictMetadata(d); + dictExpireMetaDst->key = newkey; /* reference key in keyspace */ + dictExpireMetaDst->hfe = ebCreate(); /* Allocate HFE DS */ + dictExpireMetaDst->expireMeta.trash = 1; /* mark as trash (as long it wasn't ebAdd()) */ + + /* Extract the minimum expire time of the source hash (Will be used by caller + * to register the new hash in the global ebuckets, i.e db->hexpires) */ + if (dictExpireMetaSrc->expireMeta.trash == 0) + *minHashExpire = ebGetMetaExpTime(&dictExpireMetaSrc->expireMeta); + } dictExpand(d, dictSize((const dict*)o->ptr)); hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { - sds field, value; + while (hashTypeNext(hi, 0) != C_ERR) { + uint64_t expireTime; sds newfield, newvalue; /* Extract a field-value pair from an original hash object.*/ - field = hashTypeCurrentFromHashTable(hi, OBJ_HASH_KEY); - value = hashTypeCurrentFromHashTable(hi, OBJ_HASH_VALUE); - newfield = sdsdup(field); - newvalue = sdsdup(value); + char *field, *value; + size_t fieldLen, valueLen; + hashTypeCurrentFromHashTable(hi, OBJ_HASH_KEY, 
&field, &fieldLen, &expireTime); + if (expireTime == EB_EXPIRE_TIME_INVALID) { + newfield = hfieldNew(field, fieldLen, 0); + } else { + newfield = hfieldNew(field, fieldLen, 1); + ebAdd(&dictExpireMetaDst->hfe, &hashFieldExpireBucketsType, newfield, expireTime); + } + + hashTypeCurrentFromHashTable(hi, OBJ_HASH_VALUE, &value, &valueLen, NULL); + newvalue = sdsnewlen(value, valueLen); /* Add a field-value pair to a new hash object. */ + dictUseStoredKeyApi(d, 1); dictAdd(d,newfield,newvalue); + dictUseStoredKeyApi(d, 0); } hashTypeReleaseIterator(hi); @@ -543,9 +849,9 @@ void hashReplyFromListpackEntry(client *c, listpackEntry *e) { void hashTypeRandomElement(robj *hashobj, unsigned long hashsize, listpackEntry *key, listpackEntry *val) { if (hashobj->encoding == OBJ_ENCODING_HT) { dictEntry *de = dictGetFairRandomKey(hashobj->ptr); - sds s = dictGetKey(de); - key->sval = (unsigned char*)s; - key->slen = sdslen(s); + hfield field = dictGetKey(de); + key->sval = (unsigned char*)field; + key->slen = hfieldlen(field); if (val) { sds s = dictGetVal(de); val->sval = (unsigned char*)s; @@ -558,6 +864,161 @@ void hashTypeRandomElement(robj *hashobj, unsigned long hashsize, listpackEntry } } +/* + * Active expiration of fields in hash + * + * Called by hashTypeDbActiveExpire() for each hash registered in the HFE DB + * (db->hexpires) with an expiration-time less than or equal current time. + * + * This callback performs the following actions for each hash: + * - Delete expired fields as by calling ebExpire(hash) + * - If afterward there are future fields to expire, it will update the hash in + * HFE DB with the next hash-field minimum expiration time by returning + * ACT_UPDATE_EXP_ITEM. + * - If the hash has no more fields to expire, it is removed from the HFE DB + * by returning ACT_REMOVE_EXP_ITEM. + * - If hash has no more fields afterward, it will remove the hash from keyspace. 
+ */ +static ExpireAction hashTypeActiveExpire(eItem _hashObj, void *ctx) { + robj *hashObj = (robj *) _hashObj; + ActiveExpireCtx *activeExpireCtx = (ActiveExpireCtx *) ctx; + + /* If no more quota left for this callback, stop */ + if (activeExpireCtx->fieldsToExpireQuota == 0) + return ACT_STOP_ACTIVE_EXP; + + if (hashObj->encoding == OBJ_ENCODING_LISTPACK) { + serverPanic("Listpack encoding not supported yet"); + } + serverAssert(hashObj->encoding == OBJ_ENCODING_HT); + + dict *d = hashObj->ptr; + dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + + ExpireInfo info = { + .maxToExpire = activeExpireCtx->fieldsToExpireQuota, + .onExpireItem = onFieldExpire, + .ctx = hashObj, + .now = commandTimeSnapshot(), + .itemsExpired = 0 + }; + + ebExpire(&dictExpireMeta->hfe, &hashFieldExpireBucketsType, &info); + + /* Update quota left */ + activeExpireCtx->fieldsToExpireQuota -= info.itemsExpired; + + /* If hash has no more fields to expire, remove it from HFE DB */ + if (info.nextExpireTime == 0) { + if (hashTypeLength(hashObj, 0) == 0) { + robj *key = createStringObject(dictExpireMeta->key, sdslen(dictExpireMeta->key)); + dbDelete(activeExpireCtx->db, key); + //notifyKeyspaceEvent(NOTIFY_HASH,"xxxxxxxxx",c->argv[1],c->db->id); + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key, activeExpireCtx->db->id); + server.dirty++; + signalModifiedKey(NULL, &server.db[0], key); + decrRefCount(key); + } + return ACT_REMOVE_EXP_ITEM; + } else { + /* Hash has more fields to expire. Keep hash to pending items that will + * be added back to global HFE DS at the end of ebExpire() */ + ExpireMeta *expireMeta = hashGetExpireMeta(hashObj); + ebSetMetaExpTime(expireMeta, info.nextExpireTime); + return ACT_UPDATE_EXP_ITEM; + } +} + +/* Return the next/minimum expiry time of the hash-field. 
+ * If not found, return EB_EXPIRE_TIME_INVALID */ +int64_t hashTypeGetMinExpire(robj *o) { + if (o->encoding == OBJ_ENCODING_LISTPACK) { + return EB_EXPIRE_TIME_INVALID; /* not supported yet */ + } + + serverAssert(o->encoding == OBJ_ENCODING_HT); + + dict *d = o->ptr; + if (!isDictWithMetaHFE(d)) + return EB_EXPIRE_TIME_INVALID; + + ExpireMeta *expireMeta = &((dictExpireMetadata *) dictMetadata(d))->expireMeta; + + /* Keep aside next hash-field expiry before updating HFE DS. Verify it is not trash */ + if (expireMeta->trash == 1) + return EB_EXPIRE_TIME_INVALID; + + return ebGetMetaExpTime(expireMeta); +} + +uint64_t hashTypeRemoveFromExpires(ebuckets *hexpires, robj *o) { + if (o->encoding == OBJ_ENCODING_LISTPACK) + return EB_EXPIRE_TIME_INVALID; /* not supported yet */ + + /* If dict doesn't holds HFE metadata */ + if (!isDictWithMetaHFE(o->ptr)) + return EB_EXPIRE_TIME_INVALID; + + uint64_t expireTime = ebGetExpireTime(&hashExpireBucketsType, o); + + /* If registered in global HFE DS then remove it (not trash) */ + if (expireTime != EB_EXPIRE_TIME_INVALID) + ebRemove(hexpires, &hashExpireBucketsType, o); + + return expireTime; +} + +/* Add hash to global HFE DS and update key for notifications. + * + * key - must be the same instance that is stored in db->dict + */ +void hashTypeAddToExpires(redisDb *db, sds key, robj *hashObj, uint64_t expireTime) { + if (expireTime == EB_EXPIRE_TIME_INVALID) + return; + + if (hashObj->encoding == OBJ_ENCODING_LISTPACK) { + return; /* TODO */ + } + serverAssert(hashObj->encoding == OBJ_ENCODING_HT); + + serverAssert(isDictWithMetaHFE(hashObj->ptr)); + + /* Update hash with key for notifications */ + dict *d = hashObj->ptr; + dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + dictExpireMeta->key = key; + + /* Add hash to global HFE DS */ + ebAdd(&db->hexpires, &hashExpireBucketsType, hashObj, expireTime); +} + +/* DB active expire and update hashes with time-expiration on fields. 
+ * + * The callback function hashTypeActiveExpire() is invoked for each hash registered + * in the HFE DB (db->expires) with an expiration-time less than or equal to the + * current time. This callback performs the following actions for each hash: + * - If the hash has one or more fields to expire, it will delete those fields. + * - If there are more fields to expire, it will update the hash with the next + * expiration time in HFE DB. + * - If the hash has no more fields to expire, it is removed from the HFE DB. + * - If the hash has no more fields, it is removed from the main DB. + * + * Returns number of fields active-expired. + */ +uint64_t hashTypeDbActiveExpire(redisDb *db, uint32_t maxFieldsToExpire) { + ActiveExpireCtx ctx = { .db = db, .fieldsToExpireQuota = maxFieldsToExpire }; + ExpireInfo info = { + .maxToExpire = UINT64_MAX, /* Only maxFieldsToExpire play a role */ + .onExpireItem = hashTypeActiveExpire, + .ctx = &ctx, + .now = commandTimeSnapshot(), + .itemsExpired = 0}; + + ebExpire(&db->hexpires, &hashExpireBucketsType, &info); + + /* Return number of fields active-expired */ + return maxFieldsToExpire - ctx.fieldsToExpireQuota; +} /*----------------------------------------------------------------------------- * Hash type commands @@ -571,7 +1032,7 @@ void hsetnxCommand(client *c) { addReply(c, shared.czero); } else { hashTypeTryConversion(o,c->argv,2,3); - hashTypeSet(o,c->argv[2]->ptr,c->argv[3]->ptr,HASH_SET_COPY); + hashTypeSet(c->db, o,c->argv[2]->ptr,c->argv[3]->ptr,HASH_SET_COPY); addReply(c, shared.cone); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_HASH,"hset",c->argv[1],c->db->id); @@ -592,7 +1053,7 @@ void hsetCommand(client *c) { hashTypeTryConversion(o,c->argv,2,c->argc-1); for (i = 2; i < c->argc; i += 2) - created += !hashTypeSet(o,c->argv[i]->ptr,c->argv[i+1]->ptr,HASH_SET_COPY); + created += !hashTypeSet(c->db, o,c->argv[i]->ptr,c->argv[i+1]->ptr,HASH_SET_COPY); /* HMSET (deprecated) and HSET return value is 
different. */ char *cmdname = c->argv[0]->ptr; @@ -636,7 +1097,7 @@ void hincrbyCommand(client *c) { } value += incr; new = sdsfromlonglong(value); - hashTypeSet(o,c->argv[2]->ptr,new,HASH_SET_TAKE_VALUE); + hashTypeSet(c->db, o,c->argv[2]->ptr,new,HASH_SET_TAKE_VALUE); addReplyLongLong(c,value); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_HASH,"hincrby",c->argv[1],c->db->id); @@ -679,7 +1140,7 @@ void hincrbyfloatCommand(client *c) { char buf[MAX_LONG_DOUBLE_CHARS]; int len = ld2string(buf,sizeof(buf),value,LD_STR_HUMAN); new = sdsnewlen(buf,len); - hashTypeSet(o,c->argv[2]->ptr,new,HASH_SET_TAKE_VALUE); + hashTypeSet(c->db, o,c->argv[2]->ptr,new,HASH_SET_TAKE_VALUE); addReplyBulkCBuffer(c,buf,len); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_HASH,"hincrbyfloat",c->argv[1],c->db->id); @@ -750,7 +1211,7 @@ void hdelCommand(client *c) { for (j = 2; j < c->argc; j++) { if (hashTypeDelete(o,c->argv[j]->ptr)) { deleted++; - if (hashTypeLength(o) == 0) { + if (hashTypeLength(o, 0) == 0) { dbDelete(c->db,c->argv[1]); keyremoved = 1; break; @@ -774,7 +1235,7 @@ void hlenCommand(client *c) { if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || checkType(c,o,OBJ_HASH)) return; - addReplyLongLong(c,hashTypeLength(o)); + addReplyLongLong(c,hashTypeLength(o, 0)); } void hstrlenCommand(client *c) { @@ -797,8 +1258,10 @@ static void addHashIteratorCursorToReply(client *c, hashTypeIterator *hi, int wh else addReplyBulkLongLong(c, vll); } else if (hi->encoding == OBJ_ENCODING_HT) { - sds value = hashTypeCurrentFromHashTable(hi, what); - addReplyBulkCBuffer(c, value, sdslen(value)); + char *value; + size_t len; + hashTypeCurrentFromHashTable(hi, what, &value, &len, NULL); + addReplyBulkCBuffer(c, value, len); } else { serverPanic("Unknown hash encoding"); } @@ -816,7 +1279,7 @@ void genericHgetallCommand(client *c, int flags) { /* We return a map if the user requested keys and values, like in the * HGETALL case. 
Otherwise to use a flat array makes more sense. */ - length = hashTypeLength(o); + length = hashTypeLength(o, 1 /*subtractExpiredFields*/); if (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) { addReplyMapLen(c, length); } else { @@ -824,7 +1287,12 @@ void genericHgetallCommand(client *c, int flags) { } hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + + /* Skip expired fields if the hash has an expire time set at global HFE DS. We could + * set it to constant 1, but then it will make another lookup for each field expiration */ + int skipExpiredFields = (EB_EXPIRE_TIME_INVALID == hashTypeGetMinExpire(o)) ? 0 : 1; + + while (hashTypeNext(hi, skipExpiredFields) != C_ERR) { if (flags & OBJ_HASH_KEY) { addHashIteratorCursorToReply(c, hi, OBJ_HASH_KEY); count++; @@ -869,6 +1337,7 @@ void hscanCommand(client *c) { if (parseScanCursorOrReply(c,c->argv[2],&cursor) == C_ERR) return; if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || checkType(c,o,OBJ_HASH)) return; + scanGenericCommand(c,o,cursor); } @@ -906,7 +1375,8 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { if ((hash = lookupKeyReadOrReply(c,c->argv[1],shared.emptyarray)) == NULL || checkType(c,hash,OBJ_HASH)) return; - size = hashTypeLength(hash); + /* TODO: Active-expire */ + size = hashTypeLength(hash, 0); if(l >= 0) { count = (unsigned long) l; @@ -932,14 +1402,13 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { else addReplyArrayLen(c, count); if (hash->encoding == OBJ_ENCODING_HT) { - sds key, value; while (count--) { dictEntry *de = dictGetFairRandomKey(hash->ptr); - key = dictGetKey(de); - value = dictGetVal(de); + hfield field = dictGetKey(de); + sds value = dictGetVal(de); if (withvalues && c->resp > 2) addReplyArrayLen(c,2); - addReplyBulkCBuffer(c, key, sdslen(key)); + addReplyBulkCBuffer(c, field, hfieldlen(field)); if (withvalues) addReplyBulkCBuffer(c, value, sdslen(value)); if (c->flags & CLIENT_CLOSE_ASAP) @@ 
-979,7 +1448,7 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { * elements inside the hash: simply return the whole hash. */ if(count >= size) { hashTypeIterator *hi = hashTypeInitIterator(hash); - while (hashTypeNext(hi) != C_ERR) { + while (hashTypeNext(hi, 0) != C_ERR) { if (withvalues && c->resp > 2) addReplyArrayLen(c,2); addHashIteratorCursorToReply(c, hi, OBJ_HASH_KEY); @@ -1021,12 +1490,12 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { * used into CASE 4 is highly inefficient. */ if (count*HRANDFIELD_SUB_STRATEGY_MUL > size) { /* Hashtable encoding (generic implementation) */ - dict *d = dictCreate(&sdsReplyDictType); + dict *d = dictCreate(&sdsReplyDictType); /* without metadata! */ dictExpand(d, size); hashTypeIterator *hi = hashTypeInitIterator(hash); /* Add all the elements into the temporary dictionary. */ - while ((hashTypeNext(hi)) != C_ERR) { + while ((hashTypeNext(hi, 0)) != C_ERR) { int ret = DICT_ERR; sds key, value = NULL; @@ -1044,7 +1513,9 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { while (size > count) { dictEntry *de; de = dictGetFairRandomKey(d); + dictUseStoredKeyApi(d, 1); dictUnlink(d,dictGetKey(de)); + dictUseStoredKeyApi(d, 0); sdsfree(dictGetKey(de)); sdsfree(dictGetVal(de)); dictFreeUnlinkedEntry(d,de); @@ -1134,6 +1605,510 @@ void hrandfieldCommand(client *c) { return; } - hashTypeRandomElement(hash,hashTypeLength(hash),&ele,NULL); + hashTypeRandomElement(hash,hashTypeLength(hash, 0),&ele,NULL); hashReplyFromListpackEntry(c, &ele); } + +/*----------------------------------------------------------------------------- + * Hash Field with optional expiry (based on mstr) + *----------------------------------------------------------------------------*/ +static hfield _hfieldNew(const void *field, size_t fieldlen, int withExpireMeta, + int trymalloc) +{ + if (!withExpireMeta) + return mstrNew(field, fieldlen, trymalloc); + + hfield hf = 
mstrNewWithMeta(&mstrFieldKind, field, fieldlen, + (mstrFlags) 1 << HFIELD_META_EXPIRE, trymalloc); + + ExpireMeta *expireMeta = mstrMetaRef(hf, &mstrFieldKind, HFIELD_META_EXPIRE); + + /* as long as it is not inside ebuckets, it is considered trash */ + expireMeta->trash = 1; + return hf; +} + +/* if expireAt is 0, then expireAt is ignored and no metadata is attached */ +hfield hfieldNew(const void *field, size_t fieldlen, int withExpireMeta) { + return _hfieldNew(field, fieldlen, withExpireMeta, 0); +} + +hfield hfieldTryNew(const void *field, size_t fieldlen, int withExpireMeta) { + return _hfieldNew(field, fieldlen, withExpireMeta, 1); +} + +int hfieldIsExpireAttached(hfield field) { + return mstrIsMetaAttached(field) && mstrGetFlag(field, (int) HFIELD_META_EXPIRE); +} + +static ExpireMeta* hfieldGetExpireMeta(const eItem field) { + /* extract the expireMeta from the field of type mstr */ + return mstrMetaRef(field, &mstrFieldKind, (int) HFIELD_META_EXPIRE); +} + +static uint64_t hfieldGetExpireTime(hfield field) { + if (!hfieldIsExpireAttached(field)) + return EB_EXPIRE_TIME_INVALID; + + ExpireMeta *expireMeta = mstrMetaRef(field, &mstrFieldKind, (int) HFIELD_META_EXPIRE); + if (expireMeta->trash) + return EB_EXPIRE_TIME_INVALID; + + return ebGetMetaExpTime(expireMeta); +} + +/* Remove TTL from the field. 
Assumed ExpireMeta is attached and has valid value */ +static void hfieldPersist(redisDb *db, robj *hashObj, hfield field) { + uint64_t fieldExpireTime = hfieldGetExpireTime(field); + if (fieldExpireTime == EB_EXPIRE_TIME_INVALID) + return; + + serverAssert(isDictWithMetaHFE(hashObj->ptr)); + + dict *d = hashObj->ptr; + dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *)dictMetadata(d); + + /* If field has valid expiry then dict should have valid metadata as well */ + serverAssert(dictExpireMeta->expireMeta.trash == 0); + + uint64_t minExpire = ebGetMetaExpTime(&dictExpireMeta->expireMeta); + + /* Remove field from private HFE DS */ + ebRemove(&dictExpireMeta->hfe, &hashFieldExpireBucketsType, field); + + /* If the removed field was not the minimal to expire, then no need to update + * the hash at global HFE DS. Take into account precision loss in case + * EB_BUCKET_KEY_PRECISION>0 by assisting EB_BUCKET_KEY() */ + if (EB_BUCKET_KEY(minExpire) != EB_BUCKET_KEY(fieldExpireTime)) return; + + uint64_t newMinExpire = ebGetNextTimeToExpire(dictExpireMeta->hfe, &hashFieldExpireBucketsType); + + /* Calculate the diff between minExpire and newMinExpire. If it is + * only few seconds, then don't have to update global HFE DS. At the worst + * case fields of hash will be active-expired up to few seconds later. + * + * In any case, active-expire operation will know to update global + * HFE DS more efficiently than here for a single item. + */ + uint64_t diff = (minExpire > newMinExpire) ? 
+ (minExpire - newMinExpire) : (newMinExpire - minExpire); + if (diff < HASH_NEW_EXPIRE_DIFF_THRESHOLD) return; + + ebRemove(&db->hexpires, &hashExpireBucketsType, hashObj); + + /* If it was not last field to expire */ + if (newMinExpire != EB_EXPIRE_TIME_INVALID) + ebAdd(&db->hexpires, &hashExpireBucketsType, hashObj, newMinExpire); +} + +int hfieldIsExpired(hfield field) { + /* Condition remains valid even if hfieldGetExpireTime() returns EB_EXPIRE_TIME_INVALID, + * as the constant is equivalent to (EB_EXPIRE_TIME_MAX + 1). */ + return ( (mstime_t)hfieldGetExpireTime(field) < commandTimeSnapshot()); +} + +/*----------------------------------------------------------------------------- + * Hash Field Expiration (HFE) + *----------------------------------------------------------------------------*/ +/* Called during active expiration of hash-fields */ +static ExpireAction onFieldExpire(eItem item, void *ctx) { + hfield hf = item; + robj *hashobj = (robj *) ctx; + dictUseStoredKeyApi((dict *)hashobj->ptr, 1); + hashTypeDelete(hashobj, hf); + server.stat_expired_hash_fields++; + dictUseStoredKeyApi((dict *)hashobj->ptr, 0); + return ACT_REMOVE_EXP_ITEM; +} + +/* Retrieve the ExpireMeta associated with the hash. + * The caller is responsible for ensuring that it is indeed attached. 
*/ +static ExpireMeta *hashGetExpireMeta(const eItem item) { + robj *hashObj = (robj *)item; + dict *d = hashObj->ptr; + dictExpireMetadata *dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + return &dictExpireMeta->expireMeta; +} + +/* Set time-expiration to hash-field */ +SetExpireTimeRes hashTypeSetExpire(ebuckets *eb, + robj *hashObj, + sds field, + uint64_t expireAt, + int flag, + uint64_t *minPrevExp) +{ + dict *d = hashObj->ptr; + uint64_t prevExpire = EB_EXPIRE_TIME_MAX; + + /* First retrieve the field to check if it exists */ + dictEntry *de = dictFind(d, field); + if (de == NULL) return HFE_SET_NO_FIELD; + + hfield hf = dictGetKey(de); + + /* If field doesn't have expiry metadata attached */ + if (!hfieldIsExpireAttached(hf)) { + if (flag & (HFE_XX | HFE_LT | HFE_GT)) + return HFE_SET_NO_CONDITION_MET; + + /* allocate new field with expire metadata */ + hfield hfNew = hfieldNew(hf, hfieldlen(hf), 1 /*withExpireMeta*/); + /* Replace the old field with the new one with metadata */ + dictSetKey(d, de, hfNew); + hfieldFree(hf); + hf = hfNew; + } else { + /* read previous expire time */ + prevExpire = hfieldGetExpireTime(hf); + + if (prevExpire != EB_EXPIRE_TIME_INVALID) { + if (((flag == HFE_GT) && (prevExpire >= expireAt)) || + ((flag == HFE_LT) && (prevExpire <= expireAt)) || + (flag == HFE_NX) ) + return HFE_SET_NO_CONDITION_MET; + + ebRemove(eb, &hashFieldExpireBucketsType, hf); + + if (*minPrevExp > prevExpire) + *minPrevExp = prevExpire; + } else { + if (flag & (HFE_XX | HFE_LT | HFE_GT)) + return HFE_SET_NO_CONDITION_MET; + } + } + + /* if expiration time is in the past */ + if (checkAlreadyExpired(expireAt)) { + hashTypeDelete(hashObj, field); + return HFE_SET_DELETED; + } + ebAdd(eb, &hashFieldExpireBucketsType, hf, expireAt); + + // TODO: propagate, rewrite command if needed. 
See expireGenericCommand() as reference + + return HFE_SET_OK; +} + +static void httlGenericCommand(client *c, const char *cmd, long long basetime, int unit) { + UNUSED(cmd); + robj *hashObj; + long numFields = 0, numFieldsAt = 2; + + /* Read the hash object */ + if ((hashObj = lookupKeyReadOrReply(c, c->argv[1], shared.null[c->resp])) == NULL || + checkType(c, hashObj, OBJ_HASH)) return; + + /* not supported yet listpack */ + if (hashObj->encoding == OBJ_ENCODING_LISTPACK) { + addReplyError(c,"Hash field expire for listpack not supported yet."); + return; + } + + dict *d = hashObj->ptr; + + /* Read number of fields */ + if (getRangeLongFromObjectOrReply(c, c->argv[numFieldsAt], 1, LONG_MAX, + &numFields, "Parameter `numFileds` should be greater than 0") != C_OK) + return; + + /* Verify `numFields` is consistent with number of arguments */ + if (numFields > (c->argc - numFieldsAt - 1)) { + addReplyError(c, "Parameter `numFileds` is more than number of arguments"); + return; + } + + addReplyArrayLen(c, numFields); + for (int i = 0 ; i < numFields ; i++) { + sds field = c->argv[3+i]->ptr; + dictEntry *de = dictFind(d, field); + if (de == NULL) { + addReplyLongLong(c, HFE_GET_NO_FIELD); + continue; + } + + hfield hf = dictGetKey(de); + uint64_t expire = hfieldGetExpireTime(hf); + if (expire == EB_EXPIRE_TIME_INVALID) { + addReplyLongLong(c, HFE_GET_NO_TTL); /* no ttl */ + continue; + } + + if ( (long long) expire <= commandTimeSnapshot()) { + addReplyLongLong(c, HFE_GET_NO_FIELD); + continue; + } + + if (unit == UNIT_SECONDS) + addReplyLongLong(c, (expire + 999 - basetime) / 1000); + else + addReplyLongLong(c, (expire - basetime)); + } +} + +/* This is the generic command implementation for HEXPIRE, HPEXPIRE, HEXPIREAT + * and HPEXPIREAT. Because the command second argument may be relative or absolute + * the "basetime" argument is used to signal what the base time is (either 0 + * for *AT variants of the command, or the current time for relative expires). 
+ * + * unit is either UNIT_SECONDS or UNIT_MILLISECONDS, and is only used for + * the argv[2] parameter. The basetime is always specified in milliseconds. + * + * Additional flags are supported and parsed via parseExtendedExpireArguments */ +static void hexpireGenericCommand(client *c, const char *cmd, long long basetime, int unit) { + long numFields = 0, numFieldsAt = 3; + long long expire; /* unix time in msec */ + int flag = 0; + robj *hashObj, *keyArg = c->argv[1], *expireArg = c->argv[2]; + + /* Read the hash object */ + if ((hashObj = lookupKeyWriteOrReply(c, keyArg, shared.null[c->resp])) == NULL || + checkType(c, hashObj, OBJ_HASH)) return; + + /* not supported yet listpack */ + if (hashObj->encoding == OBJ_ENCODING_LISTPACK) { + addReplyError(c,"Hash field expire for listpack not supported yet."); + return; + } + + dict *d = hashObj->ptr; + + /* Read the expiry time from command */ + if (getLongLongFromObjectOrReply(c, expireArg, &expire, NULL) != C_OK) + return; + + /* Check expire overflow */ + if (expire > (long long) EB_EXPIRE_TIME_MAX) { + addReplyErrorExpireTime(c); + return; + } + + if (unit == UNIT_SECONDS) { + if (expire > (long long) EB_EXPIRE_TIME_MAX / 1000) { + addReplyErrorExpireTime(c); + return; + } + expire *= 1000; + } else { + if (expire > (long long) EB_EXPIRE_TIME_MAX) { + addReplyErrorExpireTime(c); + return; + } + } + + if (expire > (long long) EB_EXPIRE_TIME_MAX - basetime) { + addReplyErrorExpireTime(c); + return; + } + expire += basetime; + + /* Read optional flag [NX|XX|GT|LT] */ + char *optArg = c->argv[3]->ptr; + if (!strcasecmp(optArg, "nx")) { + flag = HFE_NX; ++numFieldsAt; + } else if (!strcasecmp(optArg, "xx")) { + flag = HFE_XX; ++numFieldsAt; + } else if (!strcasecmp(optArg, "gt")) { + flag = HFE_GT; ++numFieldsAt; + } else if (!strcasecmp(optArg, "lt")) { + flag = HFE_LT; ++numFieldsAt; + } + + /* Read number of fields */ + if (getRangeLongFromObjectOrReply(c, c->argv[numFieldsAt], 1, LONG_MAX, + &numFields, "Parameter 
`numFields` should be greater than 0") != C_OK) + return; + + /* Verify `numFields` is consistent with number of arguments */ + if (numFields > (c->argc - numFieldsAt - 1)) { + addReplyError(c, "Parameter `numFileds` is more than number of arguments"); + return; + } + + dictExpireMetadata *dictExpireMeta; + uint64_t minExpire = EB_EXPIRE_TIME_INVALID; + + /* If dict doesn't have metadata attached */ + if (!isDictWithMetaHFE(d)) { + /* Realloc (only header of dict) with metadata for hash-field expiration */ + dictTypeAddMeta(&d, &mstrHashDictTypeWithHFE); + dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + hashObj->ptr = d; + + /* Find the key in the keyspace. Need to keep reference to the key for + * notifications or even removal of the hash */ + dictEntry *de = dbFind(c->db, keyArg->ptr); + serverAssert(de != NULL); + sds key = dictGetKey(de); + + /* Fillup dict HFE metadata */ + dictExpireMeta->key = key; /* reference key in keyspace */ + dictExpireMeta->hfe = ebCreate(); /* Allocate HFE DS */ + dictExpireMeta->expireMeta.trash = 1; /* mark as trash (as long it wasn't ebAdd()) */ + } else { + dictExpireMeta = (dictExpireMetadata *) dictMetadata(d); + ExpireMeta *expireMeta = &dictExpireMeta->expireMeta; + + /* Keep aside next hash-field expiry before updating HFE DS. 
Verify it is not trash */ + if (expireMeta->trash == 0) + minExpire = ebGetMetaExpTime(&dictExpireMeta->expireMeta); + } + + /* Figure out from provided set of fields in command, which one has the minimum + * expiration time, before the modification (Will be used for optimization below) */ + uint64_t minExpireFields = EB_EXPIRE_TIME_INVALID; + + /* For each field in command, update dict HFE DS */ + int fieldUpdated=0, fieldDeleted=0; + addReplyArrayLen(c, numFields); + for (int i = 0 ; i < numFields ; i++) { + sds field = c->argv[numFieldsAt+i+1]->ptr; + + SetExpireTimeRes res = hashTypeSetExpire(&dictExpireMeta->hfe, + hashObj, + field, + expire, + flag, + &minExpireFields); + addReplyLongLong(c,res); + if (res == HFE_SET_DELETED) + ++fieldDeleted; + else if (res == HFE_SET_OK) + ++fieldUpdated; + } + + /* Notify keyspace event, update dirty count and update global HFE DS */ + if (fieldDeleted + fieldUpdated > 0) { + server.dirty += fieldDeleted + fieldUpdated; + signalModifiedKey(c,c->db,keyArg); + notifyKeyspaceEvent(NOTIFY_HASH,cmd,keyArg,c->db->id); + if (fieldDeleted && hashTypeLength(hashObj, 0) == 0) { + dbDelete(c->db,keyArg); + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",keyArg, c->db->id); + } else { + + /* If minimum HFE of the hash is smaller than expiration time of the + * specified fields in the command as well as it is smaller or equal + * than expiration time provided in the command, then the minimum + * HFE of the hash won't change following this command. */ + if ((minExpire < minExpireFields) && ((long long)minExpire <= expire) ) + return; + + /* retrieve new expired time. It might have changed. */ + uint64_t newMinExpire = ebGetNextTimeToExpire(dictExpireMeta->hfe, + &hashFieldExpireBucketsType); + + /* Calculate the diff between old minExpire and newMinExpire. If it is + * only few seconds, then don't have to update global HFE DS. At the worst + * case fields of hash will be active-expired up to few seconds later. 
+ * + * In any case, active-expire operation will know to update global + * HFE DS more efficiently than here for a single item. + */ + uint64_t diff = (minExpire > newMinExpire) ? + (minExpire - newMinExpire) : (newMinExpire - minExpire); + if (diff < HASH_NEW_EXPIRE_DIFF_THRESHOLD) return; + + if (minExpire != EB_EXPIRE_TIME_INVALID) + ebRemove(&c->db->hexpires, &hashExpireBucketsType, hashObj); + if (newMinExpire != EB_EXPIRE_TIME_INVALID) + ebAdd(&c->db->hexpires, &hashExpireBucketsType, hashObj, newMinExpire); + } + } +} + +/* HPEXPIRE key milliseconds [ NX | XX | GT | LT] numfields */ +void hpexpireCommand(client *c) { + hexpireGenericCommand(c,"hpexpire", commandTimeSnapshot(),UNIT_MILLISECONDS); +} + +/* HEXPIRE key seconds [NX | XX | GT | LT] numfields */ +void hexpireCommand(client *c) { + hexpireGenericCommand(c,"hexpire", commandTimeSnapshot(),UNIT_SECONDS); +} + +/* HEXPIREAT key unix-time-seconds [NX | XX | GT | LT] numfields */ +void hexpireatCommand(client *c) { + hexpireGenericCommand(c,"hexpireat", 0,UNIT_SECONDS); +} + +/* HPEXPIREAT key unix-time-milliseconds [NX | XX | GT | LT] numfields */ +void hpexpireatCommand(client *c) { + hexpireGenericCommand(c,"hpexpireat", 0,UNIT_MILLISECONDS); +} + +/* for each specified field: get the remaining time to live in seconds*/ +/* HTTL key numfields */ +void httlCommand(client *c) { + httlGenericCommand(c, "httl", commandTimeSnapshot(), UNIT_SECONDS); +} + +/* HPTTL key numfields */ +void hpttlCommand(client *c) { + httlGenericCommand(c, "hpttl", commandTimeSnapshot(), UNIT_MILLISECONDS); +} + +/* HEXPIRETIME key numFields */ +void hexpiretimeCommand(client *c) { + httlGenericCommand(c, "hexpiretime", 0, UNIT_SECONDS); +} + +/* HPEXPIRETIME key numFields */ +void hpexpiretimeCommand(client *c) { + httlGenericCommand(c, "hexpiretime", 0, UNIT_MILLISECONDS); +} + +/* HPERSIST key */ +void hpersistCommand(client *c) { + robj *hashObj; + long numFields = 0, numFieldsAt = 2; + + /* Read the hash object */ + if 
((hashObj = lookupKeyReadOrReply(c, c->argv[1], shared.null[c->resp])) == NULL || + checkType(c, hashObj, OBJ_HASH)) return; + + /* not supported yet listpack */ + if (hashObj->encoding == OBJ_ENCODING_LISTPACK) { + addReplyError(c,"Hash field expire for listpack not supported yet."); + return; + } + + dict *d = hashObj->ptr; + + /* Read number of fields */ + if (getRangeLongFromObjectOrReply(c, c->argv[numFieldsAt], 1, LONG_MAX, + &numFields, "Parameter `numFileds` should be greater than 0") != C_OK) + return; + + /* Verify `numFields` is consistent with number of arguments */ + if (numFields > (c->argc - numFieldsAt - 1)) { + addReplyError(c, "Parameter `numFileds` is more than number of arguments"); + return; + } + + addReplyArrayLen(c, numFields); + for (int i = 0 ; i < numFields ; i++) { + sds field = c->argv[3+i]->ptr; + dictEntry *de = dictFind(d, field); + if (de == NULL) { + addReplyLongLong(c, HFE_PERSIST_NO_FIELD); + continue; + } + + hfield hf = dictGetKey(de); + uint64_t expire = hfieldGetExpireTime(hf); + if (expire == EB_EXPIRE_TIME_INVALID) { + addReplyLongLong(c, HFE_PERSIST_NO_TTL); + continue; + } + + /* Already expired. 
Pretend there is no such field */ + if ( (long long) expire <= commandTimeSnapshot()) { + addReplyLongLong(c, HFE_PERSIST_NO_FIELD); + continue; + } + + hfieldPersist(c->db, hashObj, hf); + addReplyLongLong(c, HFE_PERSIST_OK); + } +} diff --git a/src/testhelp.h b/src/testhelp.h index 26c55d3f6..3304ccfee 100644 --- a/src/testhelp.h +++ b/src/testhelp.h @@ -21,6 +21,8 @@ #define REDIS_TEST_ACCURATE (1<<0) #define REDIS_TEST_LARGE_MEMORY (1<<1) #define REDIS_TEST_VALGRIND (1<<2) +#define REDIS_TEST_VERBOSE (1<<3) + extern int __failed_tests; extern int __test_num; diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index c33ec7347..14fa6cd26 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -34,6 +34,7 @@ set ::all_tests { unit/type/set unit/type/zset unit/type/hash + unit/type/hash-field-expire unit/type/stream unit/type/stream-cgroups unit/sort diff --git a/tests/unit/type/hash-field-expire.tcl b/tests/unit/type/hash-field-expire.tcl new file mode 100644 index 000000000..21efde16d --- /dev/null +++ b/tests/unit/type/hash-field-expire.tcl @@ -0,0 +1,616 @@ +######## HEXPIRE family commands +# Field does not exists +set E_NO_FIELD -2 +# Specified NX | XX | GT | LT condition not met +set E_FAIL 0 +# expiration time set/updated +set E_OK 1 +# Field deleted because the specified expiration time is in the past +set E_DELETED 2 + +######## HTTL family commands +set T_NO_FIELD -2 +set T_NO_EXPIRY -1 + +######## HPERIST +set P_NO_FIELD -2 +set P_NO_EXPIRY -1 +set P_OK 1 + +############################### AUX FUNCS ###################################### + +proc create_hash {key entries} { + r del $key + foreach entry $entries { + r hset $key [lindex $entry 0] [lindex $entry 1] + } +} + +proc get_keys {l} { + set res {} + foreach entry $l { + set key [lindex $entry 0] + lappend res $key + } + return $res +} + +proc cmp_hrandfield_result {hash_name expected_result} { + # Accumulate hrandfield results + unset -nocomplain myhash + array set myhash {} + for 
{set i 0} {$i < 100} {incr i} { + set key [r hrandfield $hash_name] + set myhash($key) 1 + } + set res [lsort [array names myhash]] + if {$res eq $expected_result} { + return 1 + } else { + return $res + } +} + +proc hrandfieldTest {activeExpireConfig} { + r debug set-active-expire $activeExpireConfig + r del myhash + set contents {{field1 1} {field2 2} } + create_hash myhash $contents + + set factorValgrind [expr {$::valgrind ? 2 : 1}] + + # Set expiration time for field1 and field2 such that field1 expires first + r hpexpire myhash 1 NX 1 field1 + r hpexpire myhash 100 NX 1 field2 + + # On call hrandfield command lazy expire deletes field1 first + wait_for_condition 8 10 { + [cmp_hrandfield_result myhash "field2"] == 1 + } else { + fail "Expected field2 to be returned by HRANDFIELD." + } + + # On call hrandfield command lazy expire deletes field2 as well + wait_for_condition 8 20 { + [cmp_hrandfield_result myhash "{}"] == 1 + } else { + fail "Expected {} to be returned by HRANDFIELD." + } + + # restore the default value + r debug set-active-expire 1 +} + +############################### TESTS ######################################### + +start_server {tags {"external:skip needs:debug"}} { + + # Currently listpack doesn't support HFE + r config set hash-max-listpack-entries 0 + + test {HPEXPIRE - Test 'NX' flag} { + r del myhash + r hset myhash field1 value1 field2 value2 field3 value3 + assert_equal [r hpexpire myhash 1000 NX 1 field1] [list $E_OK] + assert_equal [r hpexpire myhash 1000 NX 2 field1 field2] [list $E_FAIL $E_OK] + } + + test {HPEXPIRE - Test 'XX' flag} { + r del myhash + r hset myhash field1 value1 field2 value2 field3 value3 + assert_equal [r hpexpire myhash 1000 NX 2 field1 field2] [list $E_OK $E_OK] + assert_equal [r hpexpire myhash 1000 XX 2 field1 field3] [list $E_OK $E_FAIL] + } + + test {HPEXPIRE - Test 'GT' flag} { + r del myhash + r hset myhash field1 value1 field2 value2 + assert_equal [r hpexpire myhash 1000 NX 1 field1] [list $E_OK] + 
assert_equal [r hpexpire myhash 2000 NX 1 field2] [list $E_OK] + assert_equal [r hpexpire myhash 1500 GT 2 field1 field2] [list $E_OK $E_FAIL] + } + + test {HPEXPIRE - Test 'LT' flag} { + r del myhash + r hset myhash field1 value1 field2 value2 + assert_equal [r hpexpire myhash 1000 NX 1 field1] [list $E_OK] + assert_equal [r hpexpire myhash 2000 NX 1 field2] [list $E_OK] + assert_equal [r hpexpire myhash 1500 LT 2 field1 field2] [list $E_FAIL $E_OK] + } + + test {HPEXPIREAT - field not exists or TTL is in the past} { + r del myhash + r hset myhash f1 v1 f2 v2 f4 v4 + r hexpire myhash 1000 NX 1 f4 + assert_equal [r hexpireat myhash [expr {[clock seconds] - 1}] NX 4 f1 f2 f3 f4] "$E_DELETED $E_DELETED $E_NO_FIELD $E_FAIL" + assert_equal [r hexists myhash field1] 0 + } + + test {HPEXPIRE - wrong number of arguments} { + r del myhash + r hset myhash f1 v1 + assert_error {*Parameter `numFields` should be greater than 0} {r hpexpire myhash 1000 NX 0 f1 f2 f3} + assert_error {*Parameter `numFileds` is more than number of arguments} {r hpexpire myhash 1000 NX 4 f1 f2 f3} + } + + test {HPEXPIRE - parameter expire-time near limit of 2^48} { + r del myhash + r hset myhash f1 v1 + # below & above + assert_equal [r hpexpire myhash [expr (1<<48) - [clock milliseconds] - 1000 ] 1 f1] [list $E_OK] + assert_error {*invalid expire time*} {r hpexpire myhash [expr (1<<48) - [clock milliseconds] + 100 ] 1 f1} + } + + test {Lazy - doesn't delete hash that all its fields got expired} { + r debug set-active-expire 0 + r flushall + + set hash_sizes {1 15 16 17 31 32 33 40} + foreach h $hash_sizes { + for {set i 1} {$i <= $h} {incr i} { + # random expiration time + r hset hrand$h f$i v$i + r hpexpire hrand$h [expr {50 + int(rand() * 50)}] 1 f$i + assert_equal 1 [r HEXISTS hrand$h f$i] + + # same expiration time + r hset same$h f$i v$i + r hpexpire same$h 100 1 f$i + assert_equal 1 [r HEXISTS same$h f$i] + + # same expiration time + r hset mix$h f$i v$i fieldWithoutExpire$i v$i + r hpexpire 
mix$h 100 1 f$i + assert_equal 1 [r HEXISTS mix$h f$i] + } + } + + after 150 + + # Verify that all fields got expired but keys wasn't lazy deleted + foreach h $hash_sizes { + for {set i 1} {$i <= $h} {incr i} { + assert_equal 0 [r HEXISTS mix$h f$i] + } + assert_equal 1 [r EXISTS hrand$h] + assert_equal 1 [r EXISTS same$h] + assert_equal [expr $h * 2] [r HLEN mix$h] + } + # Restore default + r debug set-active-expire 1 + } + + test {Active - deletes hash that all its fields got expired} { + r flushall + + set hash_sizes {1 15 16 17 31 32 33 40} + foreach h $hash_sizes { + for {set i 1} {$i <= $h} {incr i} { + # random expiration time + r hset hrand$h f$i v$i + r hpexpire hrand$h [expr {50 + int(rand() * 50)}] 1 f$i + assert_equal 1 [r HEXISTS hrand$h f$i] + + # same expiration time + r hset same$h f$i v$i + r hpexpire same$h 100 1 f$i + assert_equal 1 [r HEXISTS same$h f$i] + + # same expiration time + r hset mix$h f$i v$i fieldWithoutExpire$i v$i + r hpexpire mix$h 100 1 f$i + assert_equal 1 [r HEXISTS mix$h f$i] + } + } + + # Wait for active expire + wait_for_condition 50 20 { [r EXISTS same40] == 0 } else { fail "hash `same40` should be expired" } + + # Verify that all fields got expired and keys got deleted + foreach h $hash_sizes { + for {set i 1} {$i <= $h} {incr i} { + assert_equal 0 [r HEXISTS mix$h f$i] + } + assert_equal 0 [r EXISTS hrand$h] + assert_equal 0 [r EXISTS same$h] + assert_equal $h [r HLEN mix$h] + } + } + + test {HPEXPIRE - Flushall deletes all pending expired fields} { + r del myhash + r hset myhash field1 value1 field2 value2 + r hpexpire myhash 10000 NX 1 field1 + r hpexpire myhash 10000 NX 1 field2 + r flushall + r del myhash + r hset myhash field1 value1 field2 value2 + r hpexpire myhash 10000 NX 1 field1 + r hpexpire myhash 10000 NX 1 field2 + r flushall async + } + + test {HTTL/HPTTL - Input validation gets failed on nonexists field or field without expire} { + r del myhash + r HSET myhash field1 value1 field2 value2 + r HPEXPIRE 
myhash 1000 NX 1 field1 + + foreach cmd {HTTL HPTTL} { + assert_equal [r $cmd non_exists_key 1 f] {} + assert_equal [r $cmd myhash 2 field2 non_exists_field] "$T_NO_EXPIRY $T_NO_FIELD" + # Set numFields less than actual number of fields. Fine. + assert_equal [r $cmd myhash 1 non_exists_field1 non_exists_field2] "$T_NO_FIELD" + } + } + + test {HTTL/HPTTL - returns time to live in seconds/msillisec} { + r del myhash + r HSET myhash field1 value1 field2 value2 + r HPEXPIRE myhash 2000 NX 2 field1 field2 + set ttlArray [r HTTL myhash 2 field1 field2] + assert_range [lindex $ttlArray 0] 1 2 + set ttl [r HPTTL myhash 1 field1] + assert_range $ttl 1000 2000 + } + + test {HEXPIRETIME - returns TTL in Unix timestamp} { + r del myhash + r HSET myhash field1 value1 + r HPEXPIRE myhash 1000 NX 1 field1 + + set lo [expr {[clock seconds] + 1}] + set hi [expr {[clock seconds] + 2}] + assert_range [r HEXPIRETIME myhash 1 field1] $lo $hi + assert_range [r HPEXPIRETIME myhash 1 field1] [expr $lo*1000] [expr $hi*1000] + } + + test {HTTL/HPTTL - Verify TTL progress until expiration} { + r del myhash + r hset myhash field1 value1 field2 value2 + r hpexpire myhash 200 NX 1 field1 + assert_range [r HPTTL myhash 1 field1] 100 200 + assert_range [r HTTL myhash 1 field1] 0 1 + after 100 + assert_range [r HPTTL myhash 1 field1] 1 101 + after 110 + assert_equal [r HPTTL myhash 1 field1] $T_NO_FIELD + assert_equal [r HTTL myhash 1 field1] $T_NO_FIELD + } + + test {HPEXPIRE - DEL hash with non expired fields (valgrind test)} { + r del myhash + r hset myhash field1 value1 field2 value2 + r hpexpire myhash 10000 NX 1 field1 + r del myhash + } + + test {HEXPIREAT - Set time in the past} { + r del myhash + r hset myhash field1 value1 + assert_equal [r hexpireat myhash [expr {[clock seconds] - 1}] NX 1 field1] $E_DELETED + assert_equal [r hexists myhash field1] 0 + } + + test {HEXPIREAT - Set time and then get TTL} { + r del myhash + r hset myhash field1 value1 + + r hexpireat myhash [expr {[clock 
seconds] + 2}] NX 1 field1 + assert_range [r hpttl myhash 1 field1] 1000 2000 + assert_range [r httl myhash 1 field1] 1 2 + + r hexpireat myhash [expr {[clock seconds] + 5}] XX 1 field1 + assert_range [r httl myhash 1 field1] 4 5 + } + + test {Lazy expire - delete hash with expired fields} { + r del myhash + r debug set-active-expire 0 + r hset myhash k v + r hpexpire myhash 1 NX 1 k + after 5 + r del myhash + r debug set-active-expire 1 + } + +# OPEN: To decide if to delete expired fields at start of HRANDFIELD. +# test {Test HRANDFIELD does not return expired fields} { +# hrandfieldTest 0 +# hrandfieldTest 1 +# } + + test {Test HRANDFIELD can return expired fields} { + r debug set-active-expire 0 + r del myhash + r hset myhash f1 v1 f2 v2 f3 v3 f4 v4 f5 v5 + r hpexpire myhash 1 NX 4 f1 f2 f3 f4 + after 5 + set res [cmp_hrandfield_result myhash "f1 f2 f3 f4 f5"] + assert {$res == 1} + r debug set-active-expire 1 + + } + + test {Lazy expire - HLEN does count expired fields} { + # Enforce only lazy expire + r debug set-active-expire 0 + + r del h1 h4 h18 h20 + r hset h1 k1 v1 + r hpexpire h1 1 NX 1 k1 + + r hset h4 k1 v1 k2 v2 k3 v3 k4 v4 + r hpexpire h4 1 NX 3 k1 k3 k4 + + # beyond 16 fields: HFE DS (ebuckets) converts from list to rax + + r hset h18 k1 v1 k2 v2 k3 v3 k4 v4 k5 v5 k6 v6 k7 v7 k8 v8 k9 v9 k10 v10 k11 v11 k12 v12 k13 v13 k14 v14 k15 v15 k16 v16 k17 v17 k18 v18 + r hpexpire h18 1 NX 18 k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11 k12 k13 k14 k15 k16 k17 k18 + + r hset h20 k1 v1 k2 v2 k3 v3 k4 v4 k5 v5 k6 v6 k7 v7 k8 v8 k9 v9 k10 v10 k11 v11 k12 v12 k13 v13 k14 v14 k15 v15 k16 v16 k17 v17 k18 v18 k19 v19 k20 v20 + r hpexpire h20 1 NX 2 k1 k2 + + after 10 + + assert_equal [r hlen h1] 1 + assert_equal [r hlen h4] 4 + assert_equal [r hlen h18] 18 + assert_equal [r hlen h20] 20 + # Restore to support active expire + r debug set-active-expire 1 + } + + test {Lazy expire - HSCAN does not report expired fields} { + # Enforce only lazy expire + r debug set-active-expire 
0 + + r del h1 h4 h18 h20 + r hset h1 01 01 + r hpexpire h1 1 NX 1 01 + + r hset h4 01 01 02 02 03 03 04 04 + r hpexpire h4 1 NX 3 01 03 04 + + # Beyond 16 fields, the hash-field expiration data structure (ebuckets) converts from list to rax + + r hset h18 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 + r hpexpire h18 1 NX 18 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 + + r hset h20 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19 20 20 + r hpexpire h20 1 NX 2 01 02 + + after 10 + + # Verify HSCAN does not report expired fields + assert_equal [lsort -unique [lindex [r hscan h1 0 COUNT 10] 1]] "" + assert_equal [lsort -unique [lindex [r hscan h4 0 COUNT 10] 1]] "02" + assert_equal [lsort -unique [lindex [r hscan h18 0 COUNT 10] 1]] "" + assert_equal [lsort -unique [lindex [r hscan h20 0 COUNT 100] 1]] "03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20" + # Re-enable active expire for the tests that follow + r debug set-active-expire 1 + } + + test {Test HSCAN with mostly expired fields return empty result} { + r debug set-active-expire 0 + + # Create a hash with 1000 fields, 999 of which will expire + r del myhash + for {set i 1} {$i <= 1000} {incr i} { + r hset myhash field$i value$i + if {$i > 1} { + r hpexpire myhash 1 NX 1 field$i + } + } + after 3 + + # Verify iterative HSCAN returns either an empty result or only the first field + set countEmptyResult 0 + set cur 0 + while 1 { + set res [r hscan myhash $cur] + set cur [lindex $res 0] + # If the result is not empty, it must contain only the first field + if {[llength [lindex $res 1]] > 0} { + assert_equal [lindex $res 1] "field1 value1" + } else { + incr countEmptyResult + } + if {$cur == 0} break + } + assert {$countEmptyResult > 0} + r debug set-active-expire 1 + } + + test {Lazy expire - verify various HASH commands handling expired fields} { + # Disable active expire so that only lazy expire can remove fields + r debug
set-active-expire 0 + r del h1 h2 h3 h4 h5 h6 h18 + r hset h1 01 01 + r hset h2 01 01 02 02 + r hset h3 01 01 02 02 03 03 + r hset h4 1 99 2 99 3 99 4 99 + r hset h5 1 1 2 22 3 333 4 4444 5 55555 + r hset h6 01 01 02 02 03 03 04 04 05 05 06 06 + r hset h18 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 18 + r hpexpire h1 100 NX 1 01 + r hpexpire h2 100 NX 1 01 + r hpexpire h2 100 NX 1 02 + r hpexpire h3 100 NX 1 01 + r hpexpire h4 100 NX 1 2 + r hpexpire h5 100 NX 1 3 + r hpexpire h6 100 NX 1 05 + r hpexpire h18 100 NX 17 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 + + after 150 + + # Verify HDEL does not ignore an expired field: checking whether the field + # is expired before deleting it would be too much overhead. + assert_equal [r HDEL h1 01] "1" + + # Verify HGET ignores expired fields + assert_equal [r HGET h2 01] "" + assert_equal [r HGET h2 02] "" + assert_equal [r HGET h3 01] "" + assert_equal [r HGET h3 02] "02" + assert_equal [r HGET h3 03] "03" + # Verify HINCRBY ignores expired fields + assert_equal [r HINCRBY h4 2 1] "1" + assert_equal [r HINCRBY h4 3 1] "100" + # Verify HSTRLEN ignores expired fields + assert_equal [r HSTRLEN h5 3] "0" + assert_equal [r HSTRLEN h5 4] "4" + assert_equal [lsort [r HKEYS h6]] "01 02 03 04 06" + # Verify HEXISTS ignores expired fields + assert_equal [r HEXISTS h18 07] "0" + assert_equal [r HEXISTS h18 18] "1" + # Verify HVALS ignores expired fields + assert_equal [lsort [r HVALS h18]] "18" + # Re-enable active expire for the tests that follow + r debug set-active-expire 1 + } + + test {A field with TTL overridden with another value (TTL discarded)} { + r del myhash + r hset myhash field1 value1 + r hpexpire myhash 1 NX 1 field1 + r hset myhash field1 value2 + after 5 + # The TTL is expected to be discarded by the overwrite + assert_equal [r hget myhash field1] "value2" + } + + test {Modify TTL of a field} { + r del myhash + r hset myhash field1 value1 + r hpexpire myhash 200 NX 1 field1 + r hpexpire myhash 1000 XX 1 field1 +
after 15 + assert_equal [r hget myhash field1] "value1" + assert_range [r hpttl myhash 1 field1] 900 1000 + } + + test {Test HGETALL not return expired fields} { + # Test with small hash + r debug set-active-expire 0 + r del myhash myhash1 + r hset myhash1 f1 v1 f2 v2 f3 v3 f4 v4 f5 v5 + r hpexpire myhash1 1 NX 2 f2 f4 + after 10 + assert_equal [lsort [r hgetall myhash1]] "f1 f3 f5 v1 v3 v5" + + # Test with large hash + r del myhash + for {set i 1} {$i <= 600} {incr i} { + r hset myhash f$i v$i + if {$i > 3} { r hpexpire myhash 1 NX 1 f$i } + } + after 10 + assert_equal [lsort [r hgetall myhash]] [lsort "f1 f2 f3 v1 v2 v3"] + r debug set-active-expire 1 + + } + + test {Test RENAME hash with fields to be expired} { + r debug set-active-expire 0 + r del myhash + r hset myhash field1 value1 + r hpexpire myhash 20 NX 1 field1 + r rename myhash myhash2 + assert_equal [r exists myhash] 0 + assert_range [r hpttl myhash2 1 field1] 1 20 + after 25 + # Verify the renamed key still exists + assert_equal [r exists myhash2] 1 + r debug set-active-expire 1 + # Only active expire will delete the key + wait_for_condition 30 10 { [r exists myhash2] == 0 } else { fail "`myhash2` should be expired" } + } + + test {MOVE to another DB hash with fields to be expired} { + r select 9 + r flushall + r hset myhash field1 value1 + r hpexpire myhash 100 NX 1 field1 + r move myhash 10 + assert_equal [r exists myhash] 0 + assert_equal [r dbsize] 0 + + # Verify the key and its field exist in the target DB + r select 10 + assert_equal [r hget myhash field1] "value1" + assert_equal [r exists myhash] 1 + + # Eventually the field will be expired and the key will be deleted + wait_for_condition 40 10 { [r hget myhash field1] == "" } else { fail "`field1` should be expired" } + wait_for_condition 40 10 { [r exists myhash] == 0 } else { fail "db should be empty" } + } {} {singledb:skip} + + test {Test COPY hash with fields to be expired} { + r flushall + r hset h1 f1 v1 f2 v2 + r hset h2 f1 v1 f2 v2 f3 v3 f4 v4 f5
v5 f6 v6 f7 v7 f8 v8 f9 v9 f10 v10 f11 v11 f12 v12 f13 v13 f14 v14 f15 v15 f16 v16 f17 v17 f18 v18 + r hpexpire h1 100 NX 1 f1 + r hpexpire h2 100 NX 18 f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 + r COPY h1 h1copy + r COPY h2 h2copy + assert_equal [r hget h1 f1] "v1" + assert_equal [r hget h1copy f1] "v1" + assert_equal [r exists h2] 1 + assert_equal [r exists h2copy] 1 + after 105 + + # Verify lazy expire of the field in h1 and in its copy + assert_equal [r hget h1 f1] "" + assert_equal [r hget h1copy f1] "" + + # Verify all fields of h2 and of its copy expire, and that both keys are deleted as well. + wait_for_condition 40 10 { [r exists h2] == 0 } else { fail "`h2` should be expired" } + wait_for_condition 40 10 { [r exists h2copy] == 0 } else { fail "`h2copy` should be expired" } + + } {} {singledb:skip} + + test {Test SWAPDB hash-fields to be expired} { + r select 9 + r flushall + r hset myhash field1 value1 + r hpexpire myhash 50 NX 1 field1 + + r swapdb 9 10 + + # Verify the key and its field do not exist in the source DB + assert_equal [r exists myhash] 0 + assert_equal [r dbsize] 0 + + # Verify the key and its field exist in the target DB + r select 10 + assert_equal [r hget myhash field1] "value1" + assert_equal [r dbsize] 1 + + # Eventually the field will be expired and the key will be deleted + wait_for_condition 20 10 { [r exists myhash] == 0 } else { fail "'myhash' should be expired" } + } {} {singledb:skip} + + test {HPERSIST - input validation} { + # HPERSIST argument-count validation and per-field return codes + r del myhash + r hset myhash f1 v1 f2 v2 + r hexpire myhash 1000 NX 1 f1 + assert_error {*wrong number of arguments*} {r hpersist myhash} + assert_error {*wrong number of arguments*} {r hpersist myhash 1} + assert_equal [r hpersist not-exists-key 1 f1] {} + assert_equal [r hpersist myhash 2 f1 not-exists-field] "$P_OK $P_NO_FIELD" + assert_equal [r hpersist myhash 1 f2] "$P_NO_EXPIRY" + } + + test {HPERSIST - verify fields with TTL are persisted} { + r del myhash + r hset myhash
f1 v1 f2 v2 + r hpexpire myhash 20 NX 2 f1 f2 + r hpersist myhash 2 f1 f2 + after 25 + assert_equal [r hget myhash f1] "v1" + assert_equal [r hget myhash f2] "v2" + assert_equal [r HTTL myhash 2 f1 f2] "$T_NO_EXPIRY $T_NO_EXPIRY" + } + r config set hash-max-listpack-entries 1 +}