Make the chunker buffer size a parameter and recreate holes on extract.

* chunker_init() / Chunker take a new max_size argument that sets the
  chunker buffer size; archive.py passes CHUNK_MAX (10 MiB, the value that
  was previously hard-coded in _chunker.c).
* When extracting a regular file, chunks consisting only of zero bytes
  (up to CHUNK_MAX long) are skipped with seek() instead of being written,
  and the file is truncated to the final position so trailing holes count.
* Tests: new test_sparse_file; chunker tests pass CHUNK_MAX explicitly.

diff --git a/attic/_chunker.c b/attic/_chunker.c
index 94d4e47ae..f384a56b6 100644
--- a/attic/_chunker.c
+++ b/attic/_chunker.c
@@ -85,14 +85,14 @@ typedef struct {
 } Chunker;
 
 static Chunker *
-chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
+chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
 {
     Chunker *c = calloc(sizeof(Chunker), 1);
     c->window_size = window_size;
     c->chunk_mask = chunk_mask;
     c->min_size = min_size;
     c->table = buzhash_init_table(seed);
-    c->buf_size = 10 * 1024 * 1024;
+    c->buf_size = max_size;
     c->data = malloc(c->buf_size);
     return c;
 }
diff --git a/attic/archive.py b/attic/archive.py
index d78a7fdb3..b637d7f1e 100644
--- a/attic/archive.py
+++ b/attic/archive.py
@@ -22,9 +22,12 @@ from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \
 
 ITEMS_BUFFER = 1024 * 1024
 CHUNK_MIN = 1024
+CHUNK_MAX = 10 * 1024 * 1024
 WINDOW_SIZE = 0xfff
 CHUNK_MASK = 0xffff
 
+ZEROS = b'\0' * CHUNK_MAX
+
 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
 has_mtime_ns = sys.version >= '3.3'
@@ -71,7 +74,7 @@ class ChunkBuffer:
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
+        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
 
     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -134,7 +137,7 @@ class Archive:
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
             self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
-            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
+            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.time()
@@ -269,7 +272,13 @@ class Archive:
                 with open(path, 'wb') as fd:
                     ids = [c[0] for c in item[b'chunks']]
                     for data in self.pipeline.fetch_many(ids, is_preloaded=True):
-                        fd.write(data)
+                        if ZEROS.startswith(data):
+                            # all-zero chunk: create a hole in a sparse file
+                            fd.seek(len(data), 1)
+                        else:
+                            fd.write(data)
+                    pos = fd.tell()
+                    fd.truncate(pos)
                     fd.flush()
                     self.restore_attrs(path, item, fd=fd.fileno())
         elif stat.S_ISFIFO(mode):
diff --git a/attic/chunker.pyx b/attic/chunker.pyx
index 44ec31fc7..10a6adae3 100644
--- a/attic/chunker.pyx
+++ b/attic/chunker.pyx
@@ -8,7 +8,7 @@ cdef extern from "_chunker.c":
     ctypedef int uint32_t
     ctypedef struct _Chunker "Chunker":
         pass
-    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
+    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
     void chunker_set_fd(_Chunker *chunker, object fd)
     void chunker_free(_Chunker *chunker)
     object chunker_process(_Chunker *chunker)
@@ -20,8 +20,8 @@ cdef extern from "_chunker.c":
 cdef class Chunker:
     cdef _Chunker *chunker
 
-    def __cinit__(self, window_size, chunk_mask, min_size, seed):
-        self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
+    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
+        self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
 
     def chunkify(self, fd):
         chunker_set_fd(self.chunker, fd)
@@ -52,4 +52,4 @@ def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t
     table = buzhash_init_table(seed & 0xffffffff)
     sum = c_buzhash_update(sum, remove, add, len, table)
     free(table)
-    return sum
\ No newline at end of file
+    return sum
diff --git a/attic/testsuite/archiver.py b/attic/testsuite/archiver.py
index c115b460f..b9743fd58 100644
--- a/attic/testsuite/archiver.py
+++ b/attic/testsuite/archiver.py
@@ -11,7 +11,7 @@ import time
 import unittest
 from hashlib import sha256
 from attic import xattr
-from attic.archive import Archive, ChunkBuffer
+from attic.archive import Archive, ChunkBuffer, CHUNK_MAX
 from attic.archiver import Archiver
 from attic.cache import Cache
 from attic.crypto import bytes_to_long, num_aes_blocks
@@ -197,6 +197,38 @@ class ArchiverTestCase(ArchiverTestCaseBase):
             config.write(fd)
         return Repository(self.repository_path).id
 
+    def test_sparse_file(self):
+        filename = os.path.join(self.input_path, 'sparse')
+        content = b'foobar'
+        hole_size = 5 * CHUNK_MAX  # 5 full chunker buffers
+        with open(filename, 'wb') as fd:
+            # create a file that has a hole at the beginning and end
+            fd.seek(hole_size, 1)
+            fd.write(content)
+            fd.seek(hole_size, 1)
+            pos = fd.tell()
+            fd.truncate(pos)
+        total_len = hole_size + len(content) + hole_size
+        st = os.stat(filename)
+        self.assert_equal(st.st_size, total_len)
+        if hasattr(st, 'st_blocks'):
+            self.assert_true(st.st_blocks * 512 < total_len / 10)  # is input sparse?
+        self.attic('init', self.repository_location)
+        self.attic('create', self.repository_location + '::test', 'input')
+        with changedir('output'):
+            self.attic('extract', self.repository_location + '::test')
+        self.assert_dirs_equal('input', 'output/input')
+        filename = os.path.join(self.output_path, 'input', 'sparse')
+        with open(filename, 'rb') as fd:
+            # check if file contents are as expected
+            self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
+            self.assert_equal(fd.read(len(content)), content)
+            self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
+        st = os.stat(filename)
+        self.assert_equal(st.st_size, total_len)
+        if hasattr(st, 'st_blocks'):
+            self.assert_true(st.st_blocks * 512 < total_len / 10)  # is output sparse?
+
     def test_repository_swap_detection(self):
         self.create_test_files()
         os.environ['ATTIC_PASSPHRASE'] = 'passphrase'
diff --git a/attic/testsuite/chunker.py b/attic/testsuite/chunker.py
index 2e666265a..90c4a8c50 100644
--- a/attic/testsuite/chunker.py
+++ b/attic/testsuite/chunker.py
@@ -1,25 +1,26 @@
 from attic.chunker import Chunker, buzhash, buzhash_update
 from attic.testsuite import AtticTestCase
+from attic.archive import CHUNK_MAX
 from io import BytesIO
 
 
 class ChunkerTestCase(AtticTestCase):
 
     def test_chunkify(self):
-        data = b'0' * 1024 * 1024 * 15 + b'Y'
-        parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))]
+        data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
+        parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)