borgbackup/borg/testsuite/hashindex.py
Thomas Waldmann 610300c1ce misc. hash table tuning
BUCKET_UPPER_LIMIT: a 90% load factor degrades hash table performance severely,
so I lowered it to 75% (a usual value - Java uses 75%, Python uses 66%).
I chose the higher of the two because we also should not consume too much
memory, considering the RAM usage already is rather high.

MIN_BUCKETS: I can't explain why, but benchmarks showed that choosing 2^N as
table size severely degrades performance (by 3 orders of magnitude!). So a prime
start value improves this a lot, even if we later still use the grow-by-2x algorithm.

hashindex_resize: removed the hashindex_get() call as we already know that the values
come at key + key_size address.

hashindex_init: do not calloc X*Y elements of size 1, but rather X elements of size Y.
Makes the code simpler, not sure if it affects performance.

The tests needed fixing because the resulting hashtable blob is now of course
different due to the above changes, so its SHA-256 hash changed.
2015-12-01 21:18:58 +01:00

102 lines
3.6 KiB
Python

import hashlib
import os
import tempfile
from ..hashindex import NSIndex, ChunkIndex
from . import BaseTestCase
def H(x):
    # Build a deterministic 32-byte test key from the integer x:
    # its decimal digits, zero-padded on the left to 32 characters.
    return ('%-0.32d' % x).encode('ascii')
class HashIndexTestCase(BaseTestCase):
    """Tests for the hashindex extension types NSIndex and ChunkIndex."""

    def _generic_test(self, cls, make_value, sha):
        """Exercise set / update / delete / persist on *cls*.

        *make_value* maps an int to the value tuple stored in the index;
        *sha* is the expected SHA-256 hexdigest of the serialized blob,
        which must stay byte-stable across runs.
        """
        def key(x):
            # 32-byte key: left-justified, space-padded decimal of x
            return bytes('%-32d' % x, 'ascii')

        idx = cls()
        self.assert_equal(len(idx), 0)
        # Test set
        for x in range(100):
            idx[key(x)] = make_value(x)
        self.assert_equal(len(idx), 100)
        for x in range(100):
            self.assert_equal(idx[key(x)], make_value(x))
        # Test update
        for x in range(100):
            idx[key(x)] = make_value(x * 2)
        self.assert_equal(len(idx), 100)
        for x in range(100):
            self.assert_equal(idx[key(x)], make_value(x * 2))
        # Test delete
        for x in range(50):
            del idx[key(x)]
        self.assert_equal(len(idx), 50)
        idx_name = tempfile.NamedTemporaryFile()
        idx.write(idx_name.name)
        del idx
        # Verify file contents: the on-disk format is expected to be
        # deterministic, so its SHA-256 acts as a stable fingerprint.
        with open(idx_name.name, 'rb') as fd:
            self.assert_equal(hashlib.sha256(fd.read()).hexdigest(), sha)
        # Make sure we can open the file
        idx = cls.read(idx_name.name)
        self.assert_equal(len(idx), 50)
        for x in range(50, 100):
            self.assert_equal(idx[key(x)], make_value(x * 2))
        idx.clear()
        self.assert_equal(len(idx), 0)
        idx.write(idx_name.name)
        del idx
        self.assert_equal(len(cls.read(idx_name.name)), 0)

    def test_nsindex(self):
        self._generic_test(NSIndex, lambda x: (x, x),
                           '80fba5b40f8cf12f1486f1ba33c9d852fb2b41a5b5961d3b9d1228cf2aa9c4c9')

    def test_chunkindex(self):
        self._generic_test(ChunkIndex, lambda x: (x, x, x),
                           '1d71865e72e3c3af18d3c7216b6fa7b014695eaa3ed7f14cf9cd02fba75d1c95')

    def test_resize(self):
        """The serialized index grows when filled and shrinks back when emptied."""
        n = 2000  # Must be >= MIN_BUCKETS
        idx_name = tempfile.NamedTemporaryFile()
        idx = NSIndex()
        idx.write(idx_name.name)
        initial_size = os.path.getsize(idx_name.name)
        self.assert_equal(len(idx), 0)
        for x in range(n):
            idx[bytes('%-32d' % x, 'ascii')] = x, x
        idx.write(idx_name.name)
        # inserting n entries must have forced the table to grow
        self.assert_true(initial_size < os.path.getsize(idx_name.name))
        for x in range(n):
            del idx[bytes('%-32d' % x, 'ascii')]
        self.assert_equal(len(idx), 0)
        idx.write(idx_name.name)
        # after deleting everything, the written file is back to its initial size
        self.assert_equal(initial_size, os.path.getsize(idx_name.name))

    def test_iteritems(self):
        idx = NSIndex()
        for x in range(100):
            # H(x) builds exactly the '%-0.32d' key previously inlined here
            idx[H(x)] = x, x
        items = list(idx.iteritems())  # renamed from 'all': shadowed the builtin
        self.assert_equal(len(items), 100)
        # iterating with a marker resumes right after the marker key
        second_half = list(idx.iteritems(marker=items[49][0]))
        self.assert_equal(len(second_half), 50)
        self.assert_equal(second_half, items[50:])

    def test_chunkindex_merge(self):
        idx1 = ChunkIndex()
        idx1[H(1)] = 1, 100, 100
        idx1[H(2)] = 2, 200, 200
        idx1[H(3)] = 3, 300, 300
        # no H(4) entry
        idx2 = ChunkIndex()
        idx2[H(1)] = 4, 100, 100
        idx2[H(2)] = 5, 200, 200
        # no H(3) entry
        idx2[H(4)] = 6, 400, 400
        idx1.merge(idx2)
        # for common keys the refcounts add up (1+4, 2+5); the size/csize
        # fields stay as-is; keys unique to either side are all present
        assert idx1[H(1)] == (5, 100, 100)
        assert idx1[H(2)] == (7, 200, 200)
        assert idx1[H(3)] == (3, 300, 300)
        assert idx1[H(4)] == (6, 400, 400)