_hashindex.c: more fixes

- avoid buckets_length integer overflow on 32bit systems via huge num_buckets - always initialize index-> min_empty and num_empty - correctly free memory when header validation fails. this is a minor issue, because borg will terminate in that case anyway. - make it possible to lookup in compacted hashtables - deal safely with empty index: we must use num_buckets = 1 to avoid division by zero and sanity check in hashindex_read. - reinitialize upper/lower limit and min_empty after compact - fix size_idx / fit_size / grow_size / shrink_size (mind array bounds) - deal with growing when already at max capacity - hashindex_resize: replace num_entries assertion, rather return error - BaseIndex.clear: always stay in valid state Do not free the old index before we successfully have allocated a new one. This is a minor issue as the Exception raised would terminate borg anyway.
2026-06-10 09:21:44 -04:00 · 2026-04-14 01:30:54 +02:00 · 2026-04-14 01:30:54 +02:00 · cd2f5a0648
commit cd2f5a0648
parent b055b713af
3 changed files with 74 additions and 36 deletions
--- a/src/borg/_hashindex.c
+++ b/src/borg/_hashindex.c
@ -178,14 +178,9 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx)
        if (idx >= index->num_buckets) {  /* triggers at == already */
            idx = 0;
        }
-        /* When idx == start, we have done a full pass over all buckets.
-         * - We did not find a bucket with the key we searched for.
-         * - We did not find an empty bucket either.
-         * So all buckets are either full or deleted/tombstones.
-         * This is an invalid state we never should get into, see
-         * upper_limit and min_empty.
-         */
-        assert(idx != start);
+        if (idx == start) {
+            break;
+        }
    }
    /* we get here if we did not find a bucket with the key we searched for. */
    if (start_idx != NULL) {
@ -215,7 +210,12 @@ hashindex_resize(HashIndex *index, int capacity)
            return 0;
        }
    }
-    assert(index->num_entries == new->num_entries);
+
+    if(index->num_entries != new->num_entries) {
+        /* This detects structurally corrupt indices maliciously supplied with wrong num_entries */
+        hashindex_free(new);
+        return 0;
+    }

    hashindex_free_buckets(index);
    index->buckets = new->buckets;
@ -251,9 +251,15 @@ int get_min_empty(int num_buckets){

 int size_idx(int size){
    /* find the smallest hash_sizes index with entry >= size */
-    int i = NELEMS(hash_sizes) - 1;
+    int i, max_idx = NELEMS(hash_sizes) - 1;
+    i = max_idx;
    while(i >= 0 && hash_sizes[i] >= size) i--;
-    return i + 1;
+    i++;
+    /* if size is larger than any hash_sizes entry, return the largest one */
+    if (i > max_idx) {
+        return max_idx;
+    }
+    return i;
 }

 int fit_size(int current){
@ -262,17 +268,17 @@ int fit_size(int current){
 }

 int grow_size(int current){
+    int max_idx = NELEMS(hash_sizes) - 1;
    int i = size_idx(current) + 1;
-    int elems = NELEMS(hash_sizes);
-    if (i >= elems)
-        return hash_sizes[elems - 1];
+    if (i > max_idx)
+        i = max_idx;
    return hash_sizes[i];
 }

 int shrink_size(int current){
    int i = size_idx(current) - 1;
    if (i < 0)
-        return hash_sizes[0];
+        i = 0;
    return hash_sizes[i];
 }

@ -377,30 +383,35 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
    if (expected_key_size != -1 && header->key_size != expected_key_size) {
        PyErr_Format(PyExc_ValueError, "Expected key size %d, got %d.",
                     expected_key_size, header->key_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
    }
    if (expected_value_size != -1 && header->value_size != expected_value_size) {
        PyErr_Format(PyExc_ValueError, "Expected value size %d, got %d.",
                     expected_value_size, header->value_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
    }
    // in any case, the key and value sizes must be both >= 4, the code assumes this.
    if (header->key_size < 4) {
        PyErr_Format(PyExc_ValueError, "Expected key size >= 4, got %d.", header->key_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
    }
    if (header->value_size < 4) {
        PyErr_Format(PyExc_ValueError, "Expected value size >= 4, got %d.", header->value_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
    }
    // sanity check for num_buckets and num_entries.
    if (header_num_buckets < 1) {
        PyErr_Format(PyExc_ValueError, "Expected num_buckets >= 1, got %d.", header_num_buckets);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
    }
    if ((header_num_entries < 0) || (header_num_entries > header_num_buckets)) {
        PyErr_Format(PyExc_ValueError, "Expected 0 <= num_entries <= num_buckets, got %d.", header_num_entries);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
+    }
+
+    if ((Py_ssize_t)header_num_buckets > (PY_SSIZE_T_MAX - (Py_ssize_t)sizeof(HashHeader)) / (header->key_size + header->value_size)) {
+        PyErr_Format(PyExc_ValueError, "Hash index size overflows Py_ssize_t.");
+        goto fail_release_header_buffer;
    }

    buckets_length = (Py_ssize_t)header_num_buckets * (header->key_size + header->value_size);
@ -448,8 +459,9 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
    }
    index->buckets = index->buckets_buffer.buf;

+    index->min_empty = get_min_empty(index->num_buckets);
+
    if(!permit_compact) {
-        index->min_empty = get_min_empty(index->num_buckets);
        index->num_empty = count_empty(index);

        if(index->num_empty < index->min_empty) {
@ -459,6 +471,8 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
                goto fail_free_buckets;
            }
        }
+    } else {
+        index->num_empty = index->num_buckets - index->num_entries; /* upper bound */
    }

    /*
@ -612,16 +626,22 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
    if(idx < 0)
    {
        if(index->num_entries > index->upper_limit) {
-            /* hashtable too full, grow it! */
-            if(!hashindex_resize(index, grow_size(index->num_buckets))) {
+            int next_size = grow_size(index->num_buckets);
+            if(next_size != index->num_buckets) {
+                /* hashtable too full, grow it! */
+                if(!hashindex_resize(index, next_size)) {
+                    return 0;
+                }
+                /* we have just built a fresh hashtable and removed all tombstones,
+                 * so we only have EMPTY or USED buckets, but no DELETED ones any more.
+                 */
+                idx = hashindex_lookup(index, key, &start_idx);
+                assert(idx < 0);
+                assert(BUCKET_IS_EMPTY(index, start_idx));
+            } else if (index->num_entries >= index->num_buckets) {
+                /* Table is maximally sized and 100% full, cannot insert cleanly. */
                return 0;
            }
-            /* we have just built a fresh hashtable and removed all tombstones,
-             * so we only have EMPTY or USED buckets, but no DELETED ones any more.
-             */
-            idx = hashindex_lookup(index, key, &start_idx);
-            assert(idx < 0);
-            assert(BUCKET_IS_EMPTY(index, start_idx));
        }
        idx = start_idx;
        if(BUCKET_IS_EMPTY(index, idx)){
@ -699,13 +719,25 @@ hashindex_compact(HashIndex *index)
    int begin_used_idx;
    int empty_slot_count, count, buckets_to_copy;
    int compact_tail_idx = 0;
-    uint64_t saved_size = (index->num_buckets - index->num_entries) * (uint64_t)index->bucket_size;
+    int new_num_buckets = index->num_entries == 0 ? 1 : index->num_entries;
+    uint64_t saved_size = (index->num_buckets - new_num_buckets) * (uint64_t)index->bucket_size;

-    if(index->num_buckets - index->num_entries == 0) {
+    if(index->num_buckets == new_num_buckets) {
        /* already compact */
        return 0;
    }

+    if (index->num_entries == 0) {
+        index->num_buckets = 1;
+        memset(BUCKET_ADDR(index, 0), 0, index->bucket_size);
+        BUCKET_MARK_EMPTY(index, 0);
+        index->lower_limit = get_lower_limit(index->num_buckets);
+        index->upper_limit = get_upper_limit(index->num_buckets);
+        index->min_empty = get_min_empty(index->num_buckets);
+        index->num_empty = 1;
+        return saved_size;
+    }
+
    while(idx < index->num_buckets) {
        /* Phase 1: Find some empty slots */
        start_idx = idx;
@ -744,6 +776,10 @@ hashindex_compact(HashIndex *index)
    }

    index->num_buckets = index->num_entries;
+    index->lower_limit = get_lower_limit(index->num_buckets);
+    index->upper_limit = get_upper_limit(index->num_buckets);
+    index->min_empty = get_min_empty(index->num_buckets);
+    index->num_empty = 0;
    return saved_size;
 }

--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@ -122,10 +122,11 @@ cdef class IndexBase:
            hashindex_write(self.index, path)

    def clear(self):
-        hashindex_free(self.index)
-        self.index = hashindex_init(0, self.key_size, self.value_size)
-        if not self.index:
+        new_index = hashindex_init(0, self.key_size, self.value_size)
+        if not new_index:
            raise Exception('hashindex_init failed')
+        hashindex_free(self.index)
+        self.index = new_index

    def setdefault(self, key, value):
        if not key in self:
--- a/src/borg/testsuite/hashindex.py
+++ b/src/borg/testsuite/hashindex.py
@ -493,7 +493,8 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):

        compact_index = self.index_from_data_compact_to_data()

-        self.index(num_entries=0, num_buckets=0)
+        self.index(num_entries=0, num_buckets=1)
+        self.write_empty(b'\0' * 32)
        assert compact_index == self.index_data.getvalue()

    def test_merge(self):