From 4c2af9fb96d1d0a8aee7f4295875119b56bf4c96 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Wed, 8 Feb 2023 23:59:41 +0100
Subject: [PATCH 1/4] renamed hashindex_stress -> hashindex_pytest

---
 src/borg/testsuite/{hashindex_stress.py => hashindex_pytest.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/borg/testsuite/{hashindex_stress.py => hashindex_pytest.py} (100%)

diff --git a/src/borg/testsuite/hashindex_stress.py b/src/borg/testsuite/hashindex_pytest.py
similarity index 100%
rename from src/borg/testsuite/hashindex_stress.py
rename to src/borg/testsuite/hashindex_pytest.py

From e556233e30e509f32b5805f6340dca4de6300cf4 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 9 Feb 2023 00:00:36 +0100
Subject: [PATCH 2/4] hashindex_pytest: add a comment

---
 src/borg/testsuite/hashindex_pytest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/borg/testsuite/hashindex_pytest.py b/src/borg/testsuite/hashindex_pytest.py
index 5cf8b75e1..010aaec6e 100644
--- a/src/borg/testsuite/hashindex_pytest.py
+++ b/src/borg/testsuite/hashindex_pytest.py
@@ -1,3 +1,5 @@
+# more hashindex tests. kept separate so we can use pytest here.
+
 import os
 import random
 

From 1ed8ac0408e100e33f6b94249f67f4d08b0e6b95 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 9 Feb 2023 00:12:29 +0100
Subject: [PATCH 3/4] hashindex_pytest: move hashtable create into separate
 function

---
 src/borg/testsuite/hashindex_pytest.py | 27 ++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/borg/testsuite/hashindex_pytest.py b/src/borg/testsuite/hashindex_pytest.py
index 010aaec6e..90134bf67 100644
--- a/src/borg/testsuite/hashindex_pytest.py
+++ b/src/borg/testsuite/hashindex_pytest.py
@@ -8,21 +8,12 @@ import pytest
 from ..hashindex import NSIndex
 
 
-@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
-def test_hashindex_stress():
-    """checks if the hashtable behaves as expected
-
-    This can be used in _hashindex.c before running this test to provoke more collisions (don't forget to compile):
-    #define HASH_MAX_LOAD .99
-    #define HASH_MAX_EFF_LOAD .999
-    """
-    ENTRIES = 10000
-    LOOPS = 1000
+def make_hashtables(*, entries, loops):
     idx = NSIndex()
     kv = {}
-    for i in range(LOOPS):
+    for i in range(loops):
         # put some entries
-        for j in range(ENTRIES):
+        for j in range(entries):
             k = random.randbytes(32)
             v = random.randint(0, NSIndex.MAX_VALUE - 1)
             idx[k] = (v, v, v)
@@ -37,3 +28,15 @@ def test_hashindex_stress():
             assert idx[k] == (v, v, v)
         # check entry count
         assert len(kv) == len(idx)
+    return idx, kv
+
+
+@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
+def test_hashindex_stress():
+    """checks if the hashtable behaves as expected
+
+    This can be used in _hashindex.c before running this test to provoke more collisions (don't forget to compile):
+    #define HASH_MAX_LOAD .99
+    #define HASH_MAX_EFF_LOAD .999
+    """
+    make_hashtables(entries=10000, loops=1000)  # we do quite some assertions while making them

From 3d57dc0590daffa8cde021ff8ef9246cb7f4f5fc Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 9 Feb 2023 00:46:06 +0100
Subject: [PATCH 4/4] hashindex: add tests, misc. minor fixes/changes

test hashtable expansion/rebuild.

hashindex_lookup:
- return -2 for a compact / completely full hashtable
- return -1 and (via start_idx pointer) the deleted/tombstone bucket index.

fix size assertion (we add 1 element to trigger rebuild)

fix upper_limit check - since we'll be adding 1 to num_entries below,
the condition should be >=:

hashindex_compact: set min_empty/upper_limit

Co-authored-by: Dan Christensen <jdc+github@uwo.ca>
---
 src/borg/_hashindex.c                  | 21 +++++++++++++--------
 src/borg/testsuite/hashindex_pytest.py | 26 ++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c
index 5c4318dac..d5bccc4d5 100644
--- a/src/borg/_hashindex.c
+++ b/src/borg/_hashindex.c
@@ -191,11 +191,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx)
         /* When idx == start, we have done a full pass over all buckets.
          * - We did not find a bucket with the key we searched for.
          * - We did not find an empty bucket either.
-         * So all buckets are either full or deleted/tombstones.
-         * This is an invalid state we never should get into, see
-         * upper_limit and min_empty.
+         * - We may have found a deleted/tombstone bucket, though.
+         * This can easily happen if we have a compact hashtable.
          */
-        assert(idx != start);
+        if(idx == start) {
+            if(didx != -1)
+                break;  /* we have found a deleted/tombstone bucket at least */
+            return -2;  /* HT is completely full, no empty or deleted buckets. */
+        }
     }
     /* we get here if we did not find a bucket with the key we searched for. */
     if (start_idx != NULL) {
@@ -745,8 +748,8 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
     uint8_t *ptr;
     if(idx < 0)
     {
-        if(index->num_entries > index->upper_limit) {
-            /* hashtable too full, grow it! */
+        if(index->num_entries >= index->upper_limit || idx == -2) {
+            /* hashtable too full or even a compact hashtable, grow/rebuild it! */
             if(!hashindex_resize(index, grow_size(index->num_buckets))) {
                 return 0;
             }
@@ -754,7 +757,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
              * so we only have EMPTY or USED buckets, but no DELETED ones any more.
              */
             idx = hashindex_lookup(index, key, &start_idx);
-            assert(idx < 0);
+            assert(idx == -1);
             assert(BUCKET_IS_EMPTY(index, start_idx));
         }
         idx = start_idx;
@@ -768,7 +771,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
                  * so we only have EMPTY or USED buckets, but no DELETED ones any more.
                  */
                 idx = hashindex_lookup(index, key, &start_idx);
-                assert(idx < 0);
+                assert(idx == -1);
                 assert(BUCKET_IS_EMPTY(index, start_idx));
                 idx = start_idx;
             }
@@ -879,6 +882,8 @@ hashindex_compact(HashIndex *index)
 
     index->num_buckets = index->num_entries;
     index->num_empty = 0;
+    index->min_empty = 0;
+    index->upper_limit = index->num_entries;  /* triggers a resize/rebuild when a new entry is added */
     return saved_size;
 }
 
diff --git a/src/borg/testsuite/hashindex_pytest.py b/src/borg/testsuite/hashindex_pytest.py
index 90134bf67..4315d1b53 100644
--- a/src/borg/testsuite/hashindex_pytest.py
+++ b/src/borg/testsuite/hashindex_pytest.py
@@ -40,3 +40,29 @@ def test_hashindex_stress():
     #define HASH_MAX_EFF_LOAD .999
     """
     make_hashtables(entries=10000, loops=1000)  # we do quite some assertions while making them
+
+
+def test_hashindex_compact():
+    """test that we do not lose or corrupt data by the compaction nor by expanding/rebuilding"""
+    idx, kv = make_hashtables(entries=5000, loops=5)
+    size_noncompact = idx.size()
+    # compact the hashtable (remove empty/tombstone buckets)
+    saved_space = idx.compact()
+    # did we actually compact (reduce space usage)?
+    size_compact = idx.size()
+    assert saved_space > 0
+    assert size_noncompact - size_compact == saved_space
+    # did we lose anything?
+    for k, v in kv.items():
+        assert k in idx and idx[k] == (v, v, v)
+    assert len(idx) == len(kv)
+    # now expand the hashtable again. trigger a resize/rebuild by adding an entry.
+    k = b"x" * 32
+    idx[k] = (0, 0, 0)
+    kv[k] = 0
+    size_rebuilt = idx.size()
+    assert size_rebuilt > size_compact + 1
+    # did we lose anything?
+    for k, v in kv.items():
+        assert k in idx and idx[k] == (v, v, v)
+    assert len(idx) == len(kv)