Implement HLL-based APPROX mode for SUNIONCARD

Replace the TODO stub with a full HyperLogLog-based approximate
cardinality path, activated by the APPROX flag.

Extract struct hllhdr into a new hyperloglog.h header along with
declarations for createHLLObject, hllAdd, and hllCount, keeping all
HLL constants and internal functions private to hyperloglog.c.

The APPROX path reuses the existing HLL object API: createHLLObject
allocates a temporary HLL, hllAdd feeds each set element through the
standard sparse-to-dense pipeline, and hllCount estimates the final
cardinality. For LIMIT with APPROX, the estimate is checked every 1024
elements to enable early termination.

Add a large-set test (2x10K elements with overlap) verifying the
APPROX result is within 5% of exact and that APPROX+LIMIT capping
works correctly.
This commit is contained in:
Hristo Staykov 2026-03-12 20:12:36 +02:00
parent fd3b8744b4
commit 87a06033dd
4 changed files with 105 additions and 12 deletions

View file

@ -15,6 +15,7 @@
*/
#include "server.h"
#include "hyperloglog.h"
#include <stdint.h>
#include <math.h>
@ -178,13 +179,7 @@
* configured via the define server.hll_sparse_max_bytes.
*/
struct hllhdr {
char magic[4]; /* "HYLL" */
uint8_t encoding; /* HLL_DENSE or HLL_SPARSE. */
uint8_t notused[3]; /* Reserved for future use, must be zero. */
uint8_t card[8]; /* Cached cardinality, little endian. */
uint8_t registers[]; /* Data bytes. */
};
/* struct hllhdr and function declarations are in hyperloglog.h. */
/* The cached cardinality MSB is used to signal validity of the cached value. */
#define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[7] |= (1<<7)

19
src/hyperloglog.h Normal file
View file

@ -0,0 +1,19 @@
/* Public subset of the HyperLogLog implementation in hyperloglog.c, exposed
 * so other commands can build and query a temporary HLL.
 *
 * NOTE: `robj` is not declared by this header or by the standard headers it
 * pulls in. Both current includers include "server.h" before this file;
 * any new includer must do the same. */
#ifndef HYPERLOGLOG_H
#define HYPERLOGLOG_H
#include <stdint.h>
#include <stddef.h>
/* Header placed in front of the HLL register data. */
struct hllhdr {
char magic[4]; /* "HYLL" */
uint8_t encoding; /* HLL_DENSE or HLL_SPARSE. */
uint8_t notused[3]; /* Reserved for future use, must be zero. */
/* hyperloglog.c uses the most significant bit of card[7] to signal whether
 * the cached value is valid (see HLL_INVALIDATE_CACHE there). */
uint8_t card[8]; /* Cached cardinality, little endian. */
uint8_t registers[]; /* Data bytes. */
};
/* Create a new HLL object. The SUNIONCARD APPROX path releases the returned
 * object with decrRefCount().
 * NOTE(review): initial encoding is decided in hyperloglog.c - confirm. */
robj *createHLLObject(void);
/* Add the element `ele` of `elesize` bytes to the HLL object `o`.
 * NOTE(review): the meaning of the return value is not visible from this
 * header; confirm in hyperloglog.c before relying on it. */
int hllAdd(robj *o, unsigned char *ele, size_t elesize);
/* Return the estimated cardinality of the HLL whose header is `hdr`.
 * `invalid` may be passed as NULL (the SUNIONCARD APPROX path does so);
 * presumably it reports a corrupted representation when non-NULL - confirm
 * in hyperloglog.c. */
uint64_t hllCount(struct hllhdr *hdr, int *invalid);
#endif /* HYPERLOGLOG_H */

View file

@ -14,6 +14,7 @@
#include "server.h"
#include "intset.h" /* Compact integer set structure */
#include "hyperloglog.h"
/*-----------------------------------------------------------------------------
* Set Commands
@ -1840,6 +1841,8 @@ void sunionCommand(client *c) {
sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,SET_OP_UNION);
}
/* In SUNIONCARD APPROX mode with a LIMIT, the running HLL estimate is
 * recomputed once every this many processed elements to decide whether the
 * limit has been reached and iteration can stop early. */
#define HLL_APPROX_CHECK_INTERVAL 1024
/* SUNIONCARD numkeys key [key ...] [APPROX] [LIMIT limit] */
void sunioncardCommand(client *c) {
long j;
@ -1879,11 +1882,6 @@ void sunioncardCommand(client *c) {
return;
}
if (approx) {
/* TODO: Implement HLL-based approximate union cardinality.
* For now, fall through to exact mode. */
}
setopsrc *sets = zmalloc(sizeof(setopsrc) * numkeys);
for (j = 0; j < numkeys; j++) {
kvobj *setobj = lookupKeyRead(c->db, c->argv[2 + j]);
@ -1901,6 +1899,65 @@ void sunioncardCommand(client *c) {
sets[j].oldsize = kvobjAllocSize(setobj);
}
if (approx) {
/* HLL-based approximate cardinality: use a temporary HLL object
* with the standard sparse-to-dense encoding (same as PFADD). */
robj *hllobj = createHLLObject();
setTypeIterator si;
char *str;
size_t len = 0;
int64_t llval = 0;
int encoding;
long elements_processed = 0;
int early_exit = 0;
for (j = 0; j < numkeys && !early_exit; j++) {
if (!sets[j].set) continue;
setTypeInitIterator(&si, sets[j].set);
while ((encoding = setTypeNext(&si, &str, &len, &llval)) != -1) {
if (str != NULL) {
hllAdd(hllobj, (unsigned char *)str, len);
} else {
char buf[LONG_STR_SIZE];
size_t slen = ll2string(buf, sizeof(buf), (long long)llval);
hllAdd(hllobj, (unsigned char *)buf, slen);
}
elements_processed++;
if (have_limit &&
(elements_processed % HLL_APPROX_CHECK_INTERVAL == 0)) {
uint64_t est = hllCount(hllobj->ptr, NULL);
if (est >= (uint64_t)limit) {
early_exit = 1;
break;
}
}
}
setTypeResetIterator(&si);
}
uint64_t cardinality = hllCount(hllobj->ptr, NULL);
if (have_limit && cardinality > (uint64_t)limit)
cardinality = (uint64_t)limit;
if (server.memory_tracking_enabled) {
for (j = 0; j < numkeys; j++) {
robj *obj = sets[j].set;
if (!obj) continue;
updateSlotAllocSize(c->db, getKeySlot(c->argv[2 + j]->ptr), obj,
sets[j].oldsize, kvobjAllocSize(obj));
}
}
addReplyLongLong(c, (long long)cardinality);
decrRefCount(hllobj);
zfree(sets);
return;
}
/* Exact cardinality: build a temporary union set. */
int dstset_encoding = OBJ_ENCODING_INTSET;
for (j = 0; j < numkeys; j++) {
if (!sets[j].set) continue;

View file

@ -321,6 +321,28 @@ foreach type {single multiple single_multiple} {
assert_equal 0 [r sunioncard 2 set1{t} set2{t} APPROX LIMIT 0]
}
test "SUNIONCARD APPROX with large sets is within HLL error margin" {
    # Two 10K-element sets sharing the elements 5000..9999. Both sets must
    # use the same element prefix: with different prefixes they would be
    # disjoint and the union would never have to de-duplicate anything.
    r del bigset1{t} bigset2{t}
    set n 10000
    for {set i 0} {$i < $n} {incr i} {
        r sadd bigset1{t} "elem_$i"
    }
    for {set i 5000} {$i < [expr {$n + 5000}]} {incr i} {
        r sadd bigset2{t} "elem_$i"
    }

    # Sanity check of the fixture: 0..14999 is 15000 distinct elements.
    set exact [r sunioncard 2 bigset1{t} bigset2{t}]
    assert_equal [expr {$n + 5000}] $exact

    # The approximate result must stay within 5% of the exact one.
    set approx_val [r sunioncard 2 bigset1{t} bigset2{t} APPROX]
    set error_pct [expr {abs($approx_val - $exact) * 100.0 / $exact}]
    assert {$error_pct < 5.0}

    # With a LIMIT well below the real cardinality the reply is capped.
    set approx_limited [r sunioncard 2 bigset1{t} bigset2{t} APPROX LIMIT 5000]
    assert_equal 5000 $approx_limited

    r del bigset1{t} bigset2{t}
}
foreach {type} {regular intset} {
# Create sets setN{t} where N = 1..5
if {$type eq "regular"} {