diff --git a/src/hyperloglog.c b/src/hyperloglog.c
index a3e8dd180..54e05745d 100644
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -15,6 +15,7 @@
  */
 
 #include "server.h"
+#include "hyperloglog.h"
 
 #include <stdint.h>
 #include <math.h>
@@ -178,13 +179,7 @@
  * configured via the define server.hll_sparse_max_bytes.
  */
 
-struct hllhdr {
-    char magic[4];      /* "HYLL" */
-    uint8_t encoding;   /* HLL_DENSE or HLL_SPARSE. */
-    uint8_t notused[3]; /* Reserved for future use, must be zero. */
-    uint8_t card[8];    /* Cached cardinality, little endian. */
-    uint8_t registers[]; /* Data bytes. */
-};
+/* struct hllhdr and function declarations are in hyperloglog.h. */
 
 /* The cached cardinality MSB is used to signal validity of the cached value. */
 #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[7] |= (1<<7)
diff --git a/src/hyperloglog.h b/src/hyperloglog.h
new file mode 100644
index 000000000..3817d27e4
--- /dev/null
+++ b/src/hyperloglog.h
@@ -0,0 +1,31 @@
+#ifndef HYPERLOGLOG_H
+#define HYPERLOGLOG_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+/* HyperLogLog declarations shared with code outside hyperloglog.c.
+ * robj is not declared here: include "server.h" before this header,
+ * as hyperloglog.c and t_set.c do. */
+
+/* Header of an HLL value; 'registers' is the variable-length payload. */
+struct hllhdr {
+    char magic[4];      /* "HYLL" */
+    uint8_t encoding;   /* HLL_DENSE or HLL_SPARSE. */
+    uint8_t notused[3]; /* Reserved for future use, must be zero. */
+    uint8_t card[8];    /* Cached cardinality, little endian. */
+    uint8_t registers[]; /* Data bytes. */
+};
+
+/* Create a new HLL object (used as a scratch counter by SUNIONCARD APPROX). */
+robj *createHLLObject(void);
+
+/* Add the 'elesize' bytes at 'ele' to the HLL held by 'o'. */
+int hllAdd(robj *o, unsigned char *ele, size_t elesize);
+
+/* Return the cardinality estimate for the HLL starting at 'hdr'.
+ * NOTE(review): the meaning of 'invalid' is defined in hyperloglog.c;
+ * the t_set.c caller passes NULL. */
+uint64_t hllCount(struct hllhdr *hdr, int *invalid);
+
+#endif /* HYPERLOGLOG_H */
diff --git a/src/t_set.c b/src/t_set.c
index 24b92ce70..6863f870b 100644
--- a/src/t_set.c
+++ b/src/t_set.c
@@ -14,6 +14,7 @@
 
 #include "server.h"
 #include "intset.h"  /* Compact integer set structure */
+#include "hyperloglog.h"
 
 /*-----------------------------------------------------------------------------
  * Set Commands
@@ -1840,6 +1841,9 @@ void sunionCommand(client *c) {
     sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,SET_OP_UNION);
 }
 
+/* SUNIONCARD APPROX re-checks LIMIT once per this many added elements. */
+#define HLL_APPROX_CHECK_INTERVAL 1024
+
 /* SUNIONCARD numkeys key [key ...] [APPROX] [LIMIT limit] */
 void sunioncardCommand(client *c) {
     long j;
@@ -1879,11 +1883,6 @@ void sunioncardCommand(client *c) {
         return;
     }
 
-    if (approx) {
-        /* TODO: Implement HLL-based approximate union cardinality.
-         * For now, fall through to exact mode. */
-    }
-
     setopsrc *sets = zmalloc(sizeof(setopsrc) * numkeys);
     for (j = 0; j < numkeys; j++) {
         kvobj *setobj = lookupKeyRead(c->db, c->argv[2 + j]);
@@ -1901,6 +1900,71 @@ void sunioncardCommand(client *c) {
         sets[j].oldsize = kvobjAllocSize(setobj);
     }
 
+    if (approx) {
+        /* HLL-based approximate cardinality: use a temporary HLL object
+         * with the standard sparse-to-dense encoding (same as PFADD). */
+        robj *hllobj = createHLLObject();
+
+        setTypeIterator si;
+        char *str;
+        size_t len = 0;
+        int64_t llval = 0;
+        int encoding;
+        long elements_processed = 0;
+        int early_exit = 0;
+
+        for (j = 0; j < numkeys && !early_exit; j++) {
+            if (!sets[j].set) continue;
+
+            setTypeInitIterator(&si, sets[j].set);
+            while ((encoding = setTypeNext(&si, &str, &len, &llval)) != -1) {
+                if (str != NULL) {
+                    hllAdd(hllobj, (unsigned char *)str, len);
+                } else {
+                    /* No string form was returned: feed the decimal text
+                     * of llval so the member is added as bytes too. */
+                    char buf[LONG_STR_SIZE];
+                    size_t slen = ll2string(buf, sizeof(buf), (long long)llval);
+                    hllAdd(hllobj, (unsigned char *)buf, slen);
+                }
+
+                /* With LIMIT, re-estimate only once every
+                 * HLL_APPROX_CHECK_INTERVAL elements and stop scanning as
+                 * soon as the estimate reaches the limit. */
+                elements_processed++;
+                if (have_limit &&
+                    (elements_processed % HLL_APPROX_CHECK_INTERVAL == 0)) {
+                    uint64_t est = hllCount(hllobj->ptr, NULL);
+                    if (est >= (uint64_t)limit) {
+                        early_exit = 1;
+                        break;
+                    }
+                }
+            }
+            setTypeResetIterator(&si);
+        }
+
+        /* Final estimate, capped at LIMIT when one was given. */
+        uint64_t cardinality = hllCount(hllobj->ptr, NULL);
+        if (have_limit && cardinality > (uint64_t)limit)
+            cardinality = (uint64_t)limit;
+
+        if (server.memory_tracking_enabled) {
+            for (j = 0; j < numkeys; j++) {
+                robj *obj = sets[j].set;
+                if (!obj) continue;
+                updateSlotAllocSize(c->db, getKeySlot(c->argv[2 + j]->ptr), obj,
+                                    sets[j].oldsize, kvobjAllocSize(obj));
+            }
+        }
+
+        addReplyLongLong(c, (long long)cardinality);
+        decrRefCount(hllobj);
+        zfree(sets);
+        return;
+    }
+
+    /* Exact cardinality: build a temporary union set. */
     int dstset_encoding = OBJ_ENCODING_INTSET;
     for (j = 0; j < numkeys; j++) {
         if (!sets[j].set) continue;
diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index cad86f3fb..8368cbaeb 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -321,6 +321,32 @@ foreach type {single multiple single_multiple} {
         assert_equal 0 [r sunioncard 2 set1{t} set2{t} APPROX LIMIT 0]
     }
 
+    test "SUNIONCARD APPROX with large sets is within HLL error margin" {
+        r del bigset1{t} bigset2{t}
+        set n 10000
+        # bigset1 = elem_0..elem_9999, bigset2 = elem_5000..elem_14999:
+        # 5000 members are shared, so the exact union is 15000 and the
+        # APPROX path must not count the shared members twice.
+        for {set i 0} {$i < $n} {incr i} {
+            r sadd bigset1{t} "elem_$i"
+        }
+        for {set i 5000} {$i < [expr {$n + 5000}]} {incr i} {
+            r sadd bigset2{t} "elem_$i"
+        }
+
+        set exact [r sunioncard 2 bigset1{t} bigset2{t}]
+        assert_equal 15000 $exact
+        set approx_val [r sunioncard 2 bigset1{t} bigset2{t} APPROX]
+
+        set error_pct [expr {abs($approx_val - $exact) * 100.0 / $exact}]
+        assert {$error_pct < 5.0}
+
+        set approx_limited [r sunioncard 2 bigset1{t} bigset2{t} APPROX LIMIT 5000]
+        assert_equal 5000 $approx_limited
+
+        r del bigset1{t} bigset2{t}
+    }
+
     foreach {type} {regular intset} {
         # Create sets setN{t} where N = 1..5
         if {$type eq "regular"} {