diff --git a/attic/_chunker.c b/attic/_chunker.c index 372fcb326..94d4e47ae 100644 --- a/attic/_chunker.c +++ b/attic/_chunker.c @@ -85,15 +85,22 @@ typedef struct { } Chunker; static Chunker * -chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed) +chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed) { - Chunker *c = malloc(sizeof(Chunker)); + Chunker *c = calloc(sizeof(Chunker), 1); c->window_size = window_size; c->chunk_mask = chunk_mask; c->min_size = min_size; c->table = buzhash_init_table(seed); c->buf_size = 10 * 1024 * 1024; c->data = malloc(c->buf_size); + return c; +} + +static void +chunker_set_fd(Chunker *c, PyObject *fd) +{ + Py_XDECREF(c->fd); c->fd = fd; Py_INCREF(fd); c->done = 0; @@ -103,13 +110,12 @@ chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32 c->position = 0; c->last = 0; c->eof = 0; - return c; } static void chunker_free(Chunker *c) { - Py_DECREF(c->fd); + Py_XDECREF(c->fd); free(c->table); free(c->data); free(c); diff --git a/attic/archive.py b/attic/archive.py index a49d39510..171da3fe0 100644 --- a/attic/archive.py +++ b/attic/archive.py @@ -15,7 +15,7 @@ import time from io import BytesIO from attic import xattr from attic.platform import acl_get, acl_set -from attic.chunker import chunkify +from attic.chunker import Chunker from attic.hashindex import ChunkIndex from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \ Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int @@ -65,6 +65,7 @@ class ChunkBuffer: self.packer = msgpack.Packer(unicode_errors='surrogateescape') self.chunks = [] self.key = key + self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed) def add(self, item): self.buffer.write(self.packer.pack(StableDict(item))) @@ -78,7 +79,7 @@ class ChunkBuffer: if self.buffer.tell() == 0: return self.buffer.seek(0) - chunks = list(bytes(s) for s in chunkify(self.buffer, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)) + chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer)) self.buffer.seek(0) self.buffer.truncate(0) # Leave the last parital chunk in the buffer unless flush is True @@ -126,6 +127,7 @@ class Archive: self.numeric_owner = numeric_owner self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats) self.pipeline = DownloadPipeline(self.repository, self.key) + self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed) if create: if name in manifest.archives: raise self.AlreadyExists(name) @@ -399,7 +401,7 @@ class Archive: if chunks is None: with open(path, 'rb') as fd: chunks = [] - for chunk in chunkify(fd, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed): + for chunk in self.chunker.chunkify(fd): chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats)) cache.memorize_file(path_hash, st, [c[0] for c in chunks]) item = {b'path': safe_path, b'chunks': chunks} diff --git a/attic/chunker.pyx b/attic/chunker.pyx index 6fa208989..44ec31fc7 100644 --- a/attic/chunker.pyx +++ b/attic/chunker.pyx @@ -1,26 +1,31 @@ # -*- coding: utf-8 -*- -API_VERSION = 1 +API_VERSION = 2 from libc.stdlib cimport free cdef extern from "_chunker.c": ctypedef int uint32_t - ctypedef struct Chunker: + ctypedef struct _Chunker "Chunker": pass - Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed) - void chunker_free(Chunker *chunker) - object chunker_process(Chunker *chunker) + _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed) + void chunker_set_fd(_Chunker *chunker, object fd) + void chunker_free(_Chunker *chunker) + object chunker_process(_Chunker *chunker) uint32_t *buzhash_init_table(uint32_t seed) uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h) uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h) -cdef class chunkify: - cdef Chunker *chunker +cdef class Chunker: + cdef _Chunker *chunker - def __cinit__(self, fd, window_size, chunk_mask, min_size, seed): - self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff) + def __cinit__(self, window_size, chunk_mask, min_size, seed): + self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff) + + def chunkify(self, fd): + chunker_set_fd(self.chunker, fd) + return self def __dealloc__(self): if self.chunker: diff --git a/attic/helpers.py b/attic/helpers.py index 56ce4263c..d749dbd2a 100644 --- a/attic/helpers.py +++ b/attic/helpers.py @@ -74,7 +74,7 @@ class UpgradableLock: def check_extension_modules(): import attic.platform if (attic.hashindex.API_VERSION != 2 or - attic.chunker.API_VERSION != 1 or + attic.chunker.API_VERSION != 2 or attic.crypto.API_VERSION != 2 or attic.platform.API_VERSION != 2): raise ExtensionModuleError @@ -577,3 +577,4 @@ def int_to_bigint(value): if value.bit_length() > 63: return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True) return value + diff --git a/attic/testsuite/chunker.py b/attic/testsuite/chunker.py index 09e9d01f9..2e666265a 100644 --- a/attic/testsuite/chunker.py +++ b/attic/testsuite/chunker.py @@ -1,4 +1,4 @@ -from attic.chunker import chunkify, buzhash, buzhash_update +from attic.chunker import Chunker, buzhash, buzhash_update from attic.testsuite import AtticTestCase from io import BytesIO @@ -7,19 +7,19 @@ class ChunkerTestCase(AtticTestCase): def test_chunkify(self): data = b'0' * 1024 * 1024 * 15 + b'Y' - parts = [bytes(c) for c in chunkify(BytesIO(data), 2, 0x3, 2, 0)] + parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))] self.assert_equal(len(parts), 2) self.assert_equal(b''.join(parts), data) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b''), 2, 0x3, 2, 0)], []) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 0)], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 1)], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 2)], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 0)], [b'foobarboobaz' * 3]) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 1)], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz']) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 2)], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz']) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 0)], [b'foobarboobaz' * 3]) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 1)], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) - self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 2)], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz']) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], []) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz']) def test_buzhash(self): self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)