mirror of
https://github.com/redis/redis.git
synced 2026-05-28 04:02:46 -04:00
Implement HLL-based APPROX mode for SUNIONCARD
Replace the TODO stub with a full HyperLogLog-based approximate cardinality path, activated by the APPROX flag. Extract struct hllhdr into a new hyperloglog.h header along with declarations for createHLLObject, hllAdd, and hllCount, keeping all HLL constants and internal functions private to hyperloglog.c. The APPROX path reuses the existing HLL object API: createHLLObject allocates a temporary HLL, hllAdd feeds each set element through the standard sparse-to-dense pipeline, and hllCount estimates the final cardinality. For LIMIT with APPROX, the estimate is checked every 1024 elements to enable early termination. Add a large-set test (2x10K elements with overlap) verifying the APPROX result is within 5% of exact and that APPROX+LIMIT capping works correctly.
This commit is contained in:
parent
fd3b8744b4
commit
87a06033dd
4 changed files with 105 additions and 12 deletions
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "server.h"
|
||||
#include "hyperloglog.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
|
@ -178,13 +179,7 @@
|
|||
* configured via the define server.hll_sparse_max_bytes.
|
||||
*/
|
||||
|
||||
struct hllhdr {
|
||||
char magic[4]; /* "HYLL" */
|
||||
uint8_t encoding; /* HLL_DENSE or HLL_SPARSE. */
|
||||
uint8_t notused[3]; /* Reserved for future use, must be zero. */
|
||||
uint8_t card[8]; /* Cached cardinality, little endian. */
|
||||
uint8_t registers[]; /* Data bytes. */
|
||||
};
|
||||
/* struct hllhdr and function declarations are in hyperloglog.h. */
|
||||
|
||||
/* The cached cardinality MSB is used to signal validity of the cached value. */
|
||||
/* Sets the top bit of the last cached-cardinality byte (card[7]).
 * The whole expansion is parenthesized so the macro behaves as a single
 * expression: without the outer parentheses, an operator that follows the
 * macro would bind to (1<<7) instead of to the result of the assignment. */
#define HLL_INVALIDATE_CACHE(hdr) ((hdr)->card[7] |= (1<<7))
|
||||
|
|
|
|||
19
src/hyperloglog.h
Normal file
19
src/hyperloglog.h
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
#ifndef HYPERLOGLOG_H
#define HYPERLOGLOG_H

#include <stdint.h>
#include <stddef.h>

/* Forward declaration so that this header compiles on its own, regardless of
 * whether server.h was included before it. Only pointers to the object are
 * used below, so the incomplete type is sufficient.
 * NOTE(review): assumes `robj` in server.h is a typedef for
 * `struct redisObject` -- confirm before merging. */
struct redisObject;

/* Header placed in front of the HyperLogLog register data. */
struct hllhdr {
    char magic[4];       /* "HYLL" */
    uint8_t encoding;    /* HLL_DENSE or HLL_SPARSE. */
    uint8_t notused[3];  /* Reserved for future use, must be zero. */
    uint8_t card[8];     /* Cached cardinality, little endian. */
    uint8_t registers[]; /* Data bytes. */
};

/* Creates a new HyperLogLog object. The SUNIONCARD APPROX path releases the
 * returned object with decrRefCount() once it is done with it. */
struct redisObject *createHLLObject(void);

/* Adds the element `ele` of `elesize` bytes to the HyperLogLog stored in `o`.
 * NOTE(review): the meaning of the return value is defined in hyperloglog.c;
 * the SUNIONCARD APPROX caller currently ignores it -- confirm that this is
 * safe for every error code. */
int hllAdd(struct redisObject *o, unsigned char *ele, size_t elesize);

/* Returns the cardinality estimated from the HyperLogLog whose header is
 * `hdr`. Existing callers pass NULL for `invalid`.
 * NOTE(review): presumably `*invalid` is set when the representation is
 * found to be corrupted -- verify against the implementation. */
uint64_t hllCount(struct hllhdr *hdr, int *invalid);

#endif /* HYPERLOGLOG_H */
|
||||
67
src/t_set.c
67
src/t_set.c
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "server.h"
|
||||
#include "intset.h" /* Compact integer set structure */
|
||||
#include "hyperloglog.h"
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
* Set Commands
|
||||
|
|
@ -1840,6 +1841,8 @@ void sunionCommand(client *c) {
|
|||
sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,SET_OP_UNION);
|
||||
}
|
||||
|
||||
#define HLL_APPROX_CHECK_INTERVAL 1024
|
||||
|
||||
/* SUNIONCARD numkeys key [key ...] [APPROX] [LIMIT limit] */
|
||||
void sunioncardCommand(client *c) {
|
||||
long j;
|
||||
|
|
@ -1879,11 +1882,6 @@ void sunioncardCommand(client *c) {
|
|||
return;
|
||||
}
|
||||
|
||||
if (approx) {
|
||||
/* TODO: Implement HLL-based approximate union cardinality.
|
||||
* For now, fall through to exact mode. */
|
||||
}
|
||||
|
||||
setopsrc *sets = zmalloc(sizeof(setopsrc) * numkeys);
|
||||
for (j = 0; j < numkeys; j++) {
|
||||
kvobj *setobj = lookupKeyRead(c->db, c->argv[2 + j]);
|
||||
|
|
@ -1901,6 +1899,65 @@ void sunioncardCommand(client *c) {
|
|||
sets[j].oldsize = kvobjAllocSize(setobj);
|
||||
}
|
||||
|
||||
if (approx) {
|
||||
/* HLL-based approximate cardinality: use a temporary HLL object
|
||||
* with the standard sparse→dense encoding (same as PFADD). */
|
||||
robj *hllobj = createHLLObject();
|
||||
|
||||
setTypeIterator si;
|
||||
char *str;
|
||||
size_t len = 0;
|
||||
int64_t llval = 0;
|
||||
int encoding;
|
||||
long elements_processed = 0;
|
||||
int early_exit = 0;
|
||||
|
||||
for (j = 0; j < numkeys && !early_exit; j++) {
|
||||
if (!sets[j].set) continue;
|
||||
|
||||
setTypeInitIterator(&si, sets[j].set);
|
||||
while ((encoding = setTypeNext(&si, &str, &len, &llval)) != -1) {
|
||||
if (str != NULL) {
|
||||
hllAdd(hllobj, (unsigned char *)str, len);
|
||||
} else {
|
||||
char buf[LONG_STR_SIZE];
|
||||
size_t slen = ll2string(buf, sizeof(buf), (long long)llval);
|
||||
hllAdd(hllobj, (unsigned char *)buf, slen);
|
||||
}
|
||||
|
||||
elements_processed++;
|
||||
if (have_limit &&
|
||||
(elements_processed % HLL_APPROX_CHECK_INTERVAL == 0)) {
|
||||
uint64_t est = hllCount(hllobj->ptr, NULL);
|
||||
if (est >= (uint64_t)limit) {
|
||||
early_exit = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
setTypeResetIterator(&si);
|
||||
}
|
||||
|
||||
uint64_t cardinality = hllCount(hllobj->ptr, NULL);
|
||||
if (have_limit && cardinality > (uint64_t)limit)
|
||||
cardinality = (uint64_t)limit;
|
||||
|
||||
if (server.memory_tracking_enabled) {
|
||||
for (j = 0; j < numkeys; j++) {
|
||||
robj *obj = sets[j].set;
|
||||
if (!obj) continue;
|
||||
updateSlotAllocSize(c->db, getKeySlot(c->argv[2 + j]->ptr), obj,
|
||||
sets[j].oldsize, kvobjAllocSize(obj));
|
||||
}
|
||||
}
|
||||
|
||||
addReplyLongLong(c, (long long)cardinality);
|
||||
decrRefCount(hllobj);
|
||||
zfree(sets);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Exact cardinality: build a temporary union set. */
|
||||
int dstset_encoding = OBJ_ENCODING_INTSET;
|
||||
for (j = 0; j < numkeys; j++) {
|
||||
if (!sets[j].set) continue;
|
||||
|
|
|
|||
|
|
@ -321,6 +321,28 @@ foreach type {single multiple single_multiple} {
|
|||
assert_equal 0 [r sunioncard 2 set1{t} set2{t} APPROX LIMIT 0]
|
||||
}
|
||||
|
||||
test "SUNIONCARD APPROX with large sets is within HLL error margin" {
    r del bigset1{t} bigset2{t}

    # Two sets of $n members each. The second set is shifted by $shift, and
    # both sets use the SAME member prefix, so members $shift..$n-1 belong to
    # both sets. With distinct prefixes the sets would be disjoint and the
    # union path would never have to deduplicate anything.
    set n 10000
    set shift 5000
    for {set i 0} {$i < $n} {incr i} {
        r sadd bigset1{t} "elem_$i"
    }
    for {set i $shift} {$i < [expr {$n + $shift}]} {incr i} {
        r sadd bigset2{t} "elem_$i"
    }

    # The exact union holds members 0..$n+$shift-1; pin it so the overlap
    # cannot silently disappear again.
    set exact [r sunioncard 2 bigset1{t} bigset2{t}]
    assert_equal [expr {$n + $shift}] $exact

    # The approximate result must stay within 5% of the exact cardinality.
    set approx_val [r sunioncard 2 bigset1{t} bigset2{t} APPROX]
    set error_pct [expr {abs($approx_val - $exact) * 100.0 / $exact}]
    assert {$error_pct < 5.0}

    # With LIMIT the approximate result is capped at the limit.
    set approx_limited [r sunioncard 2 bigset1{t} bigset2{t} APPROX LIMIT 5000]
    assert_equal 5000 $approx_limited

    r del bigset1{t} bigset2{t}
}
|
||||
|
||||
foreach {type} {regular intset} {
|
||||
# Create sets setN{t} where N = 1..5
|
||||
if {$type eq "regular"} {
|
||||
|
|
|
|||
Loading…
Reference in a new issue