From 4c2af9fb96d1d0a8aee7f4295875119b56bf4c96 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 23:59:41 +0100 Subject: [PATCH 1/4] renamed hashindex_stress -> hashindex_pytest --- src/borg/testsuite/{hashindex_stress.py => hashindex_pytest.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/borg/testsuite/{hashindex_stress.py => hashindex_pytest.py} (100%) diff --git a/src/borg/testsuite/hashindex_stress.py b/src/borg/testsuite/hashindex_pytest.py similarity index 100% rename from src/borg/testsuite/hashindex_stress.py rename to src/borg/testsuite/hashindex_pytest.py From e556233e30e509f32b5805f6340dca4de6300cf4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 9 Feb 2023 00:00:36 +0100 Subject: [PATCH 2/4] hashindex_pytest: add a comment --- src/borg/testsuite/hashindex_pytest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/borg/testsuite/hashindex_pytest.py b/src/borg/testsuite/hashindex_pytest.py index 5cf8b75e1..010aaec6e 100644 --- a/src/borg/testsuite/hashindex_pytest.py +++ b/src/borg/testsuite/hashindex_pytest.py @@ -1,3 +1,5 @@ +# more hashindex tests. kept separate so we can use pytest here. + import os import random From 1ed8ac0408e100e33f6b94249f67f4d08b0e6b95 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 9 Feb 2023 00:12:29 +0100 Subject: [PATCH 3/4] hashindex_pytest: move hashtable create into separate function --- src/borg/testsuite/hashindex_pytest.py | 27 ++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/borg/testsuite/hashindex_pytest.py b/src/borg/testsuite/hashindex_pytest.py index 010aaec6e..90134bf67 100644 --- a/src/borg/testsuite/hashindex_pytest.py +++ b/src/borg/testsuite/hashindex_pytest.py @@ -8,21 +8,12 @@ import pytest from ..hashindex import NSIndex -@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1") -def test_hashindex_stress(): - """checks if the hashtable behaves as expected - - This can be used in _hashindex.c before running this test to provoke more collisions (don't forget to compile): - #define HASH_MAX_LOAD .99 - #define HASH_MAX_EFF_LOAD .999 - """ - ENTRIES = 10000 - LOOPS = 1000 +def make_hashtables(*, entries, loops): idx = NSIndex() kv = {} - for i in range(LOOPS): + for i in range(loops): # put some entries - for j in range(ENTRIES): + for j in range(entries): k = random.randbytes(32) v = random.randint(0, NSIndex.MAX_VALUE - 1) idx[k] = (v, v, v) @@ -37,3 +28,15 @@ def test_hashindex_stress(): assert idx[k] == (v, v, v) # check entry count assert len(kv) == len(idx) + return idx, kv + + +@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1") +def test_hashindex_stress(): + """checks if the hashtable behaves as expected + + This can be used in _hashindex.c before running this test to provoke more collisions (don't forget to compile): + #define HASH_MAX_LOAD .99 + #define HASH_MAX_EFF_LOAD .999 + """ + make_hashtables(entries=10000, loops=1000) # we do quite some assertions while making them From 3d57dc0590daffa8cde021ff8ef9246cb7f4f5fc Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 9 Feb 2023 00:46:06 +0100 Subject: [PATCH 4/4] hashindex: add tests, misc. minor fixes/changes test hashtable expansion/rebuild. hashindex_lookup: - return -2 for a compact / completely full hashtable - return -1 and (via start_idx pointer) the deleted/tombstone bucket index. fix size assertion (we add 1 element to trigger rebuild) fix upper_limit check - since we'll be adding 1 to num_entries below, the condition should be >=: hashindex_compact: set min_empty/upper_limit Co-authored-by: Dan Christensen --- src/borg/_hashindex.c | 21 +++++++++++++-------- src/borg/testsuite/hashindex_pytest.py | 26 ++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 5c4318dac..d5bccc4d5 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -191,11 +191,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) /* When idx == start, we have done a full pass over all buckets. * - We did not find a bucket with the key we searched for. * - We did not find an empty bucket either. - * So all buckets are either full or deleted/tombstones. - * This is an invalid state we never should get into, see - * upper_limit and min_empty. + * - We may have found a deleted/tombstone bucket, though. + * This can easily happen if we have a compact hashtable. */ - assert(idx != start); + if(idx == start) { + if(didx != -1) + break; /* we have found a deleted/tombstone bucket at least */ + return -2; /* HT is completely full, no empty or deleted buckets. */ + } } /* we get here if we did not find a bucket with the key we searched for. */ if (start_idx != NULL) { @@ -745,8 +748,8 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) uint8_t *ptr; if(idx < 0) { - if(index->num_entries > index->upper_limit) { - /* hashtable too full, grow it! */ + if(index->num_entries >= index->upper_limit || idx == -2) { + /* hashtable too full or even a compact hashtable, grow/rebuild it! */ if(!hashindex_resize(index, grow_size(index->num_buckets))) { return 0; } @@ -754,7 +757,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) * so we only have EMPTY or USED buckets, but no DELETED ones any more. */ idx = hashindex_lookup(index, key, &start_idx); - assert(idx < 0); + assert(idx == -1); assert(BUCKET_IS_EMPTY(index, start_idx)); } idx = start_idx; @@ -768,7 +771,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) * so we only have EMPTY or USED buckets, but no DELETED ones any more. */ idx = hashindex_lookup(index, key, &start_idx); - assert(idx < 0); + assert(idx == -1); assert(BUCKET_IS_EMPTY(index, start_idx)); idx = start_idx; } @@ -879,6 +882,8 @@ hashindex_compact(HashIndex *index) index->num_buckets = index->num_entries; index->num_empty = 0; + index->min_empty = 0; + index->upper_limit = index->num_entries; /* triggers a resize/rebuild when a new entry is added */ return saved_size; } diff --git a/src/borg/testsuite/hashindex_pytest.py b/src/borg/testsuite/hashindex_pytest.py index 90134bf67..4315d1b53 100644 --- a/src/borg/testsuite/hashindex_pytest.py +++ b/src/borg/testsuite/hashindex_pytest.py @@ -40,3 +40,29 @@ def test_hashindex_stress(): #define HASH_MAX_EFF_LOAD .999 """ make_hashtables(entries=10000, loops=1000) # we do quite some assertions while making them + + +def test_hashindex_compact(): + """test that we do not lose or corrupt data by the compaction nor by expanding/rebuilding""" + idx, kv = make_hashtables(entries=5000, loops=5) + size_noncompact = idx.size() + # compact the hashtable (remove empty/tombstone buckets) + saved_space = idx.compact() + # did we actually compact (reduce space usage)? + size_compact = idx.size() + assert saved_space > 0 + assert size_noncompact - size_compact == saved_space + # did we lose anything? + for k, v in kv.items(): + assert k in idx and idx[k] == (v, v, v) + assert len(idx) == len(kv) + # now expand the hashtable again. trigger a resize/rebuild by adding an entry. + k = b"x" * 32 + idx[k] = (0, 0, 0) + kv[k] = 0 + size_rebuilt = idx.size() + assert size_rebuilt > size_compact + 1 + # did we lose anything? + for k, v in kv.items(): + assert k in idx and idx[k] == (v, v, v) + assert len(idx) == len(kv)