_hashindex.c: more fixes

- avoid buckets_length integer overflow on 32bit systems via huge num_buckets
- always initialize index-> min_empty and num_empty
- correctly free memory when header validation fails.
  this is a minor issue, because borg will terminate in that case anyway.
- make it possible to lookup in compacted hashtables
- deal safely with empty index: we must use num_buckets = 1 to avoid division
  by zero and sanity check in hashindex_read.
- reinitialize upper/lower limit and min_empty after compact
- fix size_idx / fit_size / grow_size / shrink_size (mind array bounds)
- deal with growing when already at max capacity
- hashindex_resize: replace num_entries assertion, rather return error
- BaseIndex.clear: always stay in valid state
  Do not free the old index before we successfully have allocated a new one.
  This is a minor issue as the Exception raised would terminate borg anyway.
This commit is contained in:
Thomas Waldmann 2026-04-14 01:30:54 +02:00
parent b055b713af
commit cd2f5a0648
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
3 changed files with 74 additions and 36 deletions

View file

@ -178,14 +178,9 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx)
if (idx >= index->num_buckets) { /* triggers at == already */
idx = 0;
}
/* When idx == start, we have done a full pass over all buckets.
* - We did not find a bucket with the key we searched for.
* - We did not find an empty bucket either.
* So all buckets are either full or deleted/tombstones.
* This is an invalid state we never should get into, see
* upper_limit and min_empty.
*/
assert(idx != start);
if (idx == start) {
break;
}
}
/* we get here if we did not find a bucket with the key we searched for. */
if (start_idx != NULL) {
@ -215,7 +210,12 @@ hashindex_resize(HashIndex *index, int capacity)
return 0;
}
}
assert(index->num_entries == new->num_entries);
if(index->num_entries != new->num_entries) {
/* This detects structurally corrupt indices maliciously supplied with wrong num_entries */
hashindex_free(new);
return 0;
}
hashindex_free_buckets(index);
index->buckets = new->buckets;
@ -251,9 +251,15 @@ int get_min_empty(int num_buckets){
int size_idx(int size){
/* find the smallest hash_sizes index with entry >= size */
int i = NELEMS(hash_sizes) - 1;
int i, max_idx = NELEMS(hash_sizes) - 1;
i = max_idx;
while(i >= 0 && hash_sizes[i] >= size) i--;
return i + 1;
i++;
/* if size is larger than any hash_sizes entry, return the largest one */
if (i > max_idx) {
return max_idx;
}
return i;
}
int fit_size(int current){
@ -262,17 +268,17 @@ int fit_size(int current){
}
int grow_size(int current){
int max_idx = NELEMS(hash_sizes) - 1;
int i = size_idx(current) + 1;
int elems = NELEMS(hash_sizes);
if (i >= elems)
return hash_sizes[elems - 1];
if (i > max_idx)
i = max_idx;
return hash_sizes[i];
}
int shrink_size(int current){
int i = size_idx(current) - 1;
if (i < 0)
return hash_sizes[0];
i = 0;
return hash_sizes[i];
}
@ -377,30 +383,35 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
if (expected_key_size != -1 && header->key_size != expected_key_size) {
PyErr_Format(PyExc_ValueError, "Expected key size %d, got %d.",
expected_key_size, header->key_size);
goto fail_decref_header;
goto fail_release_header_buffer;
}
if (expected_value_size != -1 && header->value_size != expected_value_size) {
PyErr_Format(PyExc_ValueError, "Expected value size %d, got %d.",
expected_value_size, header->value_size);
goto fail_decref_header;
goto fail_release_header_buffer;
}
// in any case, the key and value sizes must be both >= 4, the code assumes this.
if (header->key_size < 4) {
PyErr_Format(PyExc_ValueError, "Expected key size >= 4, got %d.", header->key_size);
goto fail_decref_header;
goto fail_release_header_buffer;
}
if (header->value_size < 4) {
PyErr_Format(PyExc_ValueError, "Expected value size >= 4, got %d.", header->value_size);
goto fail_decref_header;
goto fail_release_header_buffer;
}
// sanity check for num_buckets and num_entries.
if (header_num_buckets < 1) {
PyErr_Format(PyExc_ValueError, "Expected num_buckets >= 1, got %d.", header_num_buckets);
goto fail_decref_header;
goto fail_release_header_buffer;
}
if ((header_num_entries < 0) || (header_num_entries > header_num_buckets)) {
PyErr_Format(PyExc_ValueError, "Expected 0 <= num_entries <= num_buckets, got %d.", header_num_entries);
goto fail_decref_header;
goto fail_release_header_buffer;
}
if ((Py_ssize_t)header_num_buckets > (PY_SSIZE_T_MAX - (Py_ssize_t)sizeof(HashHeader)) / (header->key_size + header->value_size)) {
PyErr_Format(PyExc_ValueError, "Hash index size overflows Py_ssize_t.");
goto fail_release_header_buffer;
}
buckets_length = (Py_ssize_t)header_num_buckets * (header->key_size + header->value_size);
@ -448,8 +459,9 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
}
index->buckets = index->buckets_buffer.buf;
index->min_empty = get_min_empty(index->num_buckets);
if(!permit_compact) {
index->min_empty = get_min_empty(index->num_buckets);
index->num_empty = count_empty(index);
if(index->num_empty < index->min_empty) {
@ -459,6 +471,8 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
goto fail_free_buckets;
}
}
} else {
index->num_empty = index->num_buckets - index->num_entries; /* upper bound */
}
/*
@ -612,16 +626,22 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
if(idx < 0)
{
if(index->num_entries > index->upper_limit) {
/* hashtable too full, grow it! */
if(!hashindex_resize(index, grow_size(index->num_buckets))) {
int next_size = grow_size(index->num_buckets);
if(next_size != index->num_buckets) {
/* hashtable too full, grow it! */
if(!hashindex_resize(index, next_size)) {
return 0;
}
/* we have just built a fresh hashtable and removed all tombstones,
* so we only have EMPTY or USED buckets, but no DELETED ones any more.
*/
idx = hashindex_lookup(index, key, &start_idx);
assert(idx < 0);
assert(BUCKET_IS_EMPTY(index, start_idx));
} else if (index->num_entries >= index->num_buckets) {
/* Table is maximally sized and 100% full, cannot insert cleanly. */
return 0;
}
/* we have just built a fresh hashtable and removed all tombstones,
* so we only have EMPTY or USED buckets, but no DELETED ones any more.
*/
idx = hashindex_lookup(index, key, &start_idx);
assert(idx < 0);
assert(BUCKET_IS_EMPTY(index, start_idx));
}
idx = start_idx;
if(BUCKET_IS_EMPTY(index, idx)){
@ -699,13 +719,25 @@ hashindex_compact(HashIndex *index)
int begin_used_idx;
int empty_slot_count, count, buckets_to_copy;
int compact_tail_idx = 0;
uint64_t saved_size = (index->num_buckets - index->num_entries) * (uint64_t)index->bucket_size;
int new_num_buckets = index->num_entries == 0 ? 1 : index->num_entries;
uint64_t saved_size = (index->num_buckets - new_num_buckets) * (uint64_t)index->bucket_size;
if(index->num_buckets - index->num_entries == 0) {
if(index->num_buckets == new_num_buckets) {
/* already compact */
return 0;
}
if (index->num_entries == 0) {
index->num_buckets = 1;
memset(BUCKET_ADDR(index, 0), 0, index->bucket_size);
BUCKET_MARK_EMPTY(index, 0);
index->lower_limit = get_lower_limit(index->num_buckets);
index->upper_limit = get_upper_limit(index->num_buckets);
index->min_empty = get_min_empty(index->num_buckets);
index->num_empty = 1;
return saved_size;
}
while(idx < index->num_buckets) {
/* Phase 1: Find some empty slots */
start_idx = idx;
@ -744,6 +776,10 @@ hashindex_compact(HashIndex *index)
}
index->num_buckets = index->num_entries;
index->lower_limit = get_lower_limit(index->num_buckets);
index->upper_limit = get_upper_limit(index->num_buckets);
index->min_empty = get_min_empty(index->num_buckets);
index->num_empty = 0;
return saved_size;
}

View file

@ -122,10 +122,11 @@ cdef class IndexBase:
hashindex_write(self.index, path)
def clear(self):
hashindex_free(self.index)
self.index = hashindex_init(0, self.key_size, self.value_size)
if not self.index:
new_index = hashindex_init(0, self.key_size, self.value_size)
if not new_index:
raise Exception('hashindex_init failed')
hashindex_free(self.index)
self.index = new_index
def setdefault(self, key, value):
if not key in self:

View file

@ -493,7 +493,8 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
compact_index = self.index_from_data_compact_to_data()
self.index(num_entries=0, num_buckets=0)
self.index(num_entries=0, num_buckets=1)
self.write_empty(b'\0' * 32)
assert compact_index == self.index_data.getvalue()
def test_merge(self):