diff --git a/borg/_chunker.c b/borg/_chunker.c index 7f772ca4b..2d1a03629 100644 --- a/borg/_chunker.c +++ b/borg/_chunker.c @@ -249,11 +249,12 @@ chunker_process(Chunker *c) PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch"); return NULL; } - while(c->remaining <= window_size && !c->eof) { + while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */ if(!chunker_fill(c)) { return NULL; } } + /* here we either are at eof ... */ if(c->eof) { c->done = 1; if(c->remaining) { @@ -268,8 +269,15 @@ chunker_process(Chunker *c) return NULL; } } + /* ... or we have at least min_size + window_size + 1 bytes remaining. + * We do not want to "cut" a chunk smaller than min_size and the hash + * window starts at the potential cutting place. + */ + c->position += min_size; + c->remaining -= min_size; + n += min_size; sum = buzhash(c->data + c->position, window_size, c->table); - while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) { + while(c->remaining > c->window_size && (sum & chunk_mask)) { sum = buzhash_update(sum, c->data[c->position], c->data[c->position + window_size], window_size, c->table); diff --git a/borg/chunker.pyx b/borg/chunker.pyx index 0faa06f38..560e14c82 100644 --- a/borg/chunker.pyx +++ b/borg/chunker.pyx @@ -23,6 +23,8 @@ cdef class Chunker: def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size): min_size = 1 << chunk_min_exp max_size = 1 << chunk_max_exp + # see chunker_process: the first term of its first while loop condition must be able to become True: + assert hash_window_size + min_size + 1 <= max_size, "too small max_size" hash_mask = (1 << hash_mask_bits) - 1 self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff) diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index 5b8cf95af..7f4719d74 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -1491,9 
+1491,9 @@ class ArchiverTestCase(ArchiverTestCaseBase): self.cmd('create', self.repository_location + '::test', 'input') archive_before = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}') with patch.object(Cache, 'add_chunk', self._test_recreate_chunker_interrupt_patch()): - self.cmd('recreate', '-pv', '--chunker-params', '10,12,11,4095', self.repository_location) + self.cmd('recreate', '-pv', '--chunker-params', '10,13,11,4095', self.repository_location) assert 'test.recreate' in self.cmd('list', self.repository_location) - output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,12,11,4095', self.repository_location) + output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,13,11,4095', self.repository_location) assert 'Found test.recreate, will resume' in output assert 'Copied 1 chunks from a partially processed item' in output archive_after = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')