Merge pull request #7328 from ThomasWaldmann/hashindex-compact

hashindex minor fixes, tweaks, tests
This commit is contained in:
TW 2023-02-09 22:46:48 +01:00 committed by GitHub
commit 16f4bf5d7b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 81 additions and 45 deletions

View file

@ -191,11 +191,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx)
/* When idx == start, we have done a full pass over all buckets.
* - We did not find a bucket with the key we searched for.
* - We did not find an empty bucket either.
* So all buckets are either full or deleted/tombstones.
* This is an invalid state we never should get into, see
* upper_limit and min_empty.
* - We may have found a deleted/tombstone bucket, though.
* This can easily happen if we have a compact hashtable.
*/
assert(idx != start);
if(idx == start) {
if(didx != -1)
break; /* we have found a deleted/tombstone bucket at least */
return -2; /* HT is completely full, no empty or deleted buckets. */
}
}
/* we get here if we did not find a bucket with the key we searched for. */
if (start_idx != NULL) {
@ -745,8 +748,8 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
uint8_t *ptr;
if(idx < 0)
{
if(index->num_entries > index->upper_limit) {
/* hashtable too full, grow it! */
if(index->num_entries >= index->upper_limit || idx == -2) {
/* hashtable too full or even a compact hashtable, grow/rebuild it! */
if(!hashindex_resize(index, grow_size(index->num_buckets))) {
return 0;
}
@ -754,7 +757,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
* so we only have EMPTY or USED buckets, but no DELETED ones any more.
*/
idx = hashindex_lookup(index, key, &start_idx);
assert(idx < 0);
assert(idx == -1);
assert(BUCKET_IS_EMPTY(index, start_idx));
}
idx = start_idx;
@ -768,7 +771,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
* so we only have EMPTY or USED buckets, but no DELETED ones any more.
*/
idx = hashindex_lookup(index, key, &start_idx);
assert(idx < 0);
assert(idx == -1);
assert(BUCKET_IS_EMPTY(index, start_idx));
idx = start_idx;
}
@ -879,6 +882,8 @@ hashindex_compact(HashIndex *index)
index->num_buckets = index->num_entries;
index->num_empty = 0;
index->min_empty = 0;
index->upper_limit = index->num_entries; /* triggers a resize/rebuild when a new entry is added */
return saved_size;
}

View file

@ -0,0 +1,68 @@
# More hashindex tests, kept in a separate module so that we can use pytest here.
import os
import random
import pytest
from ..hashindex import NSIndex
def make_hashtables(*, entries, loops):
    """Fill an NSIndex and a reference dict via rounds of random inserts and deletes.

    Every one of the ``loops`` rounds first stores ``entries`` random 32-byte keys,
    then pops a randomly sized, randomly chosen subset of all keys currently present.
    After each round the index is compared against the reference dict.

    Returns ``(idx, kv)``: the index and a dict mapping each remaining key to the
    value ``v`` whose triple ``(v, v, v)`` is stored in the index under that key.
    """
    idx = NSIndex()
    kv = {}
    for _round in range(loops):
        # insert a batch of random entries into both containers
        for _ in range(entries):
            key = random.randbytes(32)
            value = random.randint(0, NSIndex.MAX_VALUE - 1)
            idx[key] = (value, value, value)
            kv[key] = value
        # pop a random number of randomly picked keys, verifying each popped value
        delete_count = random.randint(0, len(kv))
        for key in random.sample(list(kv), k=delete_count):
            value = kv.pop(key)
            assert idx.pop(key) == (value, value, value)
        # whatever was not deleted must still be retrievable and unchanged
        for key, value in kv.items():
            assert idx[key] == (value, value, value)
        # both containers must agree on the number of entries
        assert len(kv) == len(idx)
    return idx, kv
@pytest.mark.skipif(
    "BORG_TESTS_SLOW" not in os.environ,
    reason="slow tests not enabled, use BORG_TESTS_SLOW=1",
)
def test_hashindex_stress():
    """Stress the hashtable with many rounds of random inserts and deletes.

    To provoke more collisions, these can be set in _hashindex.c before running
    this test (remember to recompile):

    #define HASH_MAX_LOAD .99
    #define HASH_MAX_EFF_LOAD .999
    """
    # All checks are assertions inside make_hashtables; the result is not needed here.
    make_hashtables(entries=10000, loops=1000)
def test_hashindex_compact():
    """Neither compaction nor the following expand/rebuild may lose or corrupt data."""
    idx, kv = make_hashtables(entries=5000, loops=5)

    def assert_nothing_lost():
        # every reference entry must be present with its expected value triple
        for key, value in kv.items():
            assert key in idx and idx[key] == (value, value, value)
        assert len(idx) == len(kv)

    # compact the hashtable (remove empty/tombstone buckets) and measure the effect
    size_before = idx.size()
    saved_space = idx.compact()
    size_compact = idx.size()
    # space usage must really have gone down, by exactly the reported amount
    assert saved_space > 0
    assert size_before - size_compact == saved_space
    assert_nothing_lost()

    # adding one entry to the compact table should trigger a resize/rebuild
    extra_key = b"x" * 32
    idx[extra_key] = (0, 0, 0)
    kv[extra_key] = 0
    assert idx.size() > size_compact + 1
    assert_nothing_lost()

View file

@ -1,37 +0,0 @@
import os
import random
import pytest
from ..hashindex import NSIndex
@pytest.mark.skipif(
    "BORG_TESTS_SLOW" not in os.environ,
    reason="slow tests not enabled, use BORG_TESTS_SLOW=1",
)
def test_hashindex_stress():
    """Stress the hashtable with many rounds of random inserts and deletes.

    To provoke more collisions, these can be set in _hashindex.c before running
    this test (remember to recompile):

    #define HASH_MAX_LOAD .99
    #define HASH_MAX_EFF_LOAD .999
    """
    entries_per_round = 10000
    rounds = 1000
    idx = NSIndex()
    kv = {}
    for _round in range(rounds):
        # insert a batch of random entries into both containers
        for _ in range(entries_per_round):
            key = random.randbytes(32)
            value = random.randint(0, NSIndex.MAX_VALUE - 1)
            idx[key] = (value, value, value)
            kv[key] = value
        # pop a random number of randomly picked keys, verifying each popped value
        delete_count = random.randint(0, len(kv))
        for key in random.sample(list(kv), k=delete_count):
            value = kv.pop(key)
            assert idx.pop(key) == (value, value, value)
        # whatever was not deleted must still be retrievable and unchanged
        for key, value in kv.items():
            assert idx[key] == (value, value, value)
        # both containers must agree on the number of entries
        assert len(kv) == len(idx)