Merge pull request #1077 from ThomasWaldmann/do-not-chunk-small-files

chunker: speed up remainder <= min_size case
This commit is contained in:
enkore 2016-05-24 18:44:44 +02:00
commit 33f3a70cf6
3 changed files with 15 additions and 5 deletions

View file

@ -96,7 +96,7 @@ buzhash(const unsigned char *data, size_t len, const uint32_t *h)
/*
 * Roll an existing buzhash value forward by one byte.
 *
 * sum    - hash of the current window
 * remove - byte value dropped from the window
 * add    - byte value appended to the window
 * len    - window length; only its low 5 bits (len & 0x1f) are used as the
 *          shift amount for the removed byte's table entry
 * h      - lookup table indexed by byte value
 *
 * Returns the previous sum barrel-shifted by 1, XORed with the removed byte's
 * table entry barrel-shifted by lenmod, XORed with the added byte's table
 * entry. BARREL_SHIFT is defined outside this view - presumably a 32-bit
 * rotate; confirm against its definition.
 *
 * NOTE(review): this is a diff hunk shown without +/- markers. The two
 * `lenmod` declarations below are the before/after versions of one line;
 * only the second (with the trailing comment) belongs to the resulting file.
 */
static uint32_t
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
{
uint32_t lenmod = len & 0x1f;
uint32_t lenmod = len & 0x1f; /* Note: replace by constant to get small speedup */
return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add];
}
@ -249,11 +249,12 @@ chunker_process(Chunker *c)
PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
return NULL;
}
while(c->remaining <= window_size && !c->eof) {
while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */
if(!chunker_fill(c)) {
return NULL;
}
}
/* here we either are at eof ... */
if(c->eof) {
c->done = 1;
if(c->remaining) {
@ -268,8 +269,15 @@ chunker_process(Chunker *c)
return NULL;
}
}
/* ... or we have at least min_size + window_size + 1 bytes remaining.
* We do not want to "cut" a chunk smaller than min_size and the hash
* window starts at the potential cutting place.
*/
c->position += min_size;
c->remaining -= min_size;
n += min_size;
sum = buzhash(c->data + c->position, window_size, c->table);
while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) {
while(c->remaining > c->window_size && (sum & chunk_mask)) {
sum = buzhash_update(sum, c->data[c->position],
c->data[c->position + window_size],
window_size, c->table);

View file

@ -23,6 +23,8 @@ cdef class Chunker:
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
min_size = 1 << chunk_min_exp
max_size = 1 << chunk_max_exp
# see chunker_process, first while loop condition, first term must be able to get True:
assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
hash_mask = (1 << hash_mask_bits) - 1
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

View file

@ -1515,9 +1515,9 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.cmd('create', self.repository_location + '::test', 'input')
archive_before = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')
with patch.object(Cache, 'add_chunk', self._test_recreate_chunker_interrupt_patch()):
self.cmd('recreate', '-pv', '--chunker-params', '10,12,11,4095', self.repository_location)
self.cmd('recreate', '-pv', '--chunker-params', '10,13,11,4095', self.repository_location)
assert 'test.recreate' in self.cmd('list', self.repository_location)
output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,12,11,4095', self.repository_location)
output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,13,11,4095', self.repository_location)
assert 'Found test.recreate, will resume' in output
assert 'Copied 1 chunks from a partially processed item' in output
archive_after = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')