From 8834f6fdbddb083dc60221fe9a49f437ee161cb5 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 21 May 2016 23:16:18 +0200 Subject: [PATCH] chunker: do not buzhash if not needed, fixes #1021 For small remainders of files (last chunk), we do not need to buzhash if it is already clear that there is not enough left (we want at least min_size big chunks). Small files are handled by same code - as they only give 1 chunk, that is the last chunk (see above). See "Cases" considerations below. For big files, we do not need to buzhash the first min_size bytes of a chunk - we do not want to cut there anyway, so we can start buzhashing at offset min_size. Cases (before this change) -------------------------- - A) remaining <= window_size - would do 2 chunker_fill calls (both line 253) and trigger eof with the 2nd call - no buzhashing - result is 1 length chunk - B) window_size < remaining <= min_size: - the chunker would do 1 chunker_fill call (line 253) that would read the entire remaining file (but not trigger eof yet) - would compute all possible remaining - window_size + 1 buzhashes, but without a chance for a cut, because there is also the n < min_size condition - would do another chunker_fill call (line 282), but not get more data, so loop ends - result is 1 length chunk - C) file > min_size: - normal chunking Cases (after this change) ------------------------- - A) similar to above A), but up to remaining < min_size + window_size + 1, so it does not buzhash if there is no chance for a cut. - B) see C) above --- borg/_chunker.c | 12 ++++++++++-- borg/chunker.pyx | 2 ++ borg/testsuite/archiver.py | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/borg/_chunker.c b/borg/_chunker.c index 7f772ca4b..2d1a03629 100644 --- a/borg/_chunker.c +++ b/borg/_chunker.c @@ -249,11 +249,12 @@ chunker_process(Chunker *c) PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch"); return NULL; } - while(c->remaining <= window_size && !c->eof) { + while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */ if(!chunker_fill(c)) { return NULL; } } + /* here we either are at eof ... */ if(c->eof) { c->done = 1; if(c->remaining) { @@ -268,8 +269,15 @@ chunker_process(Chunker *c) return NULL; } } + /* ... or we have at least min_size + window_size + 1 bytes remaining. + * We do not want to "cut" a chunk smaller than min_size and the hash + * window starts at the potential cutting place. + */ + c->position += min_size; + c->remaining -= min_size; + n += min_size; sum = buzhash(c->data + c->position, window_size, c->table); - while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) { + while(c->remaining > c->window_size && (sum & chunk_mask)) { sum = buzhash_update(sum, c->data[c->position], c->data[c->position + window_size], window_size, c->table); diff --git a/borg/chunker.pyx b/borg/chunker.pyx index 0faa06f38..560e14c82 100644 --- a/borg/chunker.pyx +++ b/borg/chunker.pyx @@ -23,6 +23,8 @@ cdef class Chunker: def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size): min_size = 1 << chunk_min_exp max_size = 1 << chunk_max_exp + # see chunker_process, first while loop condition, first term must be able to get True: + assert hash_window_size + min_size + 1 <= max_size, "too small max_size" hash_mask = (1 << hash_mask_bits) - 1 self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff) diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index 5b8cf95af..7f4719d74 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -1491,9 +1491,9 @@ class ArchiverTestCase(ArchiverTestCaseBase): self.cmd('create', self.repository_location + '::test', 'input') archive_before = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}') with patch.object(Cache, 'add_chunk', self._test_recreate_chunker_interrupt_patch()): - self.cmd('recreate', '-pv', '--chunker-params', '10,12,11,4095', self.repository_location) + self.cmd('recreate', '-pv', '--chunker-params', '10,13,11,4095', self.repository_location) assert 'test.recreate' in self.cmd('list', self.repository_location) - output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,12,11,4095', self.repository_location) + output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,13,11,4095', self.repository_location) assert 'Found test.recreate, will resume' in output assert 'Copied 1 chunks from a partially processed item' in output archive_after = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')