From 330315ba0d37c91a7bf7e5cabd7905a0b93159bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?= Date: Thu, 4 Nov 2010 21:19:01 +0100 Subject: [PATCH] Chunkifier improvements. Including tin foil hat compliant seeding. --- darc/_speedups.c | 44 +++++++++------- darc/archive.py | 8 +-- darc/chunkifier.py | 122 --------------------------------------------- darc/keychain.py | 3 ++ 4 files changed, 34 insertions(+), 143 deletions(-) delete mode 100644 darc/chunkifier.py diff --git a/darc/_speedups.c b/darc/_speedups.c index 657eaf97d..b3bd9bf6d 100644 --- a/darc/_speedups.c +++ b/darc/_speedups.c @@ -1,6 +1,9 @@ #include #include +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define ABS(X) ((X) < 0 ? (-(X)) : (X)) + static unsigned long int checksum(const unsigned char *data, int len, unsigned long int sum) { @@ -28,7 +31,7 @@ roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, in typedef struct { PyObject_HEAD - int chunk_size, window_size, i, last, eof, done, buf_size, data_len, initial; + int chunk_size, window_size, i, last, eof, done, buf_size, data_len, seed; PyObject *chunks, *fd; unsigned long int sum; unsigned char *data, add, remove; @@ -43,8 +46,7 @@ ChunkifyIter_iter(PyObject *self) c->eof = 0; c->i = 0; c->sum = 0; - c->last = -1; - c->initial = c->window_size; + c->last = 0; Py_INCREF(self); return self; } @@ -62,6 +64,8 @@ static PyObject* ChunkifyIter_iternext(PyObject *self) { ChunkifyIter *c = (ChunkifyIter *)self; + int initial = c->window_size; + if(c->done) { PyErr_SetNone(PyExc_StopIteration); @@ -72,6 +76,7 @@ ChunkifyIter_iternext(PyObject *self) if(c->i == c->buf_size) { int diff = c->last + 1 - c->window_size; + assert(diff >= 0); memmove(c->data, c->data + diff, c->buf_size - diff); c->i -= diff; c->last -= diff; @@ -90,18 +95,20 @@ ChunkifyIter_iternext(PyObject *self) } if(c->i == c->data_len) { - if(c->last < c->i - 1) { + if(c->last < c->i) { c->done = 1; - return PyString_FromStringAndSize((char *)(c->data + c->last + 1), - c->data_len - c->last - 1); + return PyString_FromStringAndSize((char *)(c->data + c->last), + c->data_len - c->last); } PyErr_SetNone(PyExc_StopIteration); return NULL; } - if(c->initial) + if(initial) { - c->initial--; - c->sum = checksum(c->data + c->i, 1, c->sum); + int bytes = MIN(initial, c->data_len - c->i); + initial -= bytes; + c->sum = checksum(c->data + c->i, bytes, 0); + c->i += bytes; } else { @@ -109,20 +116,20 @@ ChunkifyIter_iternext(PyObject *self) c->data[c->i - c->window_size], c->data[c->i], c->window_size); + c->i++; } - c->i++; - if(c->i == c->buf_size && c->last == -1) + if((c->sum % c->chunk_size) == c->seed) { int old_last = c->last; - c->last = c->i - 1; - return PyString_FromStringAndSize((char *)(c->data + old_last + 1), + c->last = c->i; + return PyString_FromStringAndSize((char *)(c->data + old_last), c->last - old_last); } - else if((c->sum % c->chunk_size) == 0) + if(c->i == c->buf_size && c->last <= c->window_size) { int old_last = c->last; - c->last = c->i - 1; - return PyString_FromStringAndSize((char *)(c->data + old_last + 1), + c->last = c->i; + return PyString_FromStringAndSize((char *)(c->data + old_last), c->last - old_last); } } @@ -167,10 +174,10 @@ static PyObject * chunkify(PyObject *self, PyObject *args) { PyObject *fd; - long int chunk_size, window_size; + int chunk_size, window_size, seed; ChunkifyIter *c; - if (!PyArg_ParseTuple(args, "Oii", &fd, &chunk_size, &window_size)) + if (!PyArg_ParseTuple(args, "Oiii", &fd, &chunk_size, &window_size, &seed)) { return NULL; } @@ -184,6 +191,7 @@ chunkify(PyObject *self, PyObject *args) c->fd = fd; c->chunk_size = chunk_size; c->window_size = window_size; + c->seed = seed % chunk_size; Py_INCREF(fd); return (PyObject *)c; } diff --git a/darc/archive.py b/darc/archive.py index dd7c1add6..5877ee573 100644 --- a/darc/archive.py +++ b/darc/archive.py @@ -8,10 +8,11 @@ import sys from xattr import xattr, XATTR_NOFOLLOW from . import NS_ARCHIVE_METADATA, NS_ARCHIVE_ITEMS, NS_ARCHIVE_CHUNKS, NS_CHUNK -from .chunkifier import chunkify +from ._speedups import chunkify from .helpers import uid2user, user2uid, gid2group, group2gid, IntegrityError -CHUNK_SIZE = 55001 +CHUNK_SIZE = 64 * 1024 +WINDOW_SIZE = 4096 have_lchmod = hasattr(os, 'lchmod') linux = sys.platform == 'linux2' @@ -253,7 +254,8 @@ class Archive(object): size = 0 ids = [] chunks = [] - for chunk in chunkify(fd, CHUNK_SIZE, 30): + for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE, + self.keychain.get_chunkify_seed()): id = self.keychain.id_hash(chunk) ids.append(id) try: diff --git a/darc/chunkifier.py b/darc/chunkifier.py deleted file mode 100644 index 25340a5f1..000000000 --- a/darc/chunkifier.py +++ /dev/null @@ -1,122 +0,0 @@ -def checksum(data, sum=0): - """Simple but fast checksum that can be updated at either end. - - >>> checksum('FOOBAR') - 102367679 - >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO')) - True - """ - s1 = sum & 0xffff - s2 = sum >> 16 - for c in data: - s1 += ord(c) + 1 - s2 += s1 - return ((s2 & 0xffff) << 16) + (s1 & 0xffff) - - -def roll_checksum(sum, remove, add, len): - """ - >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR') - True - """ - s1 = sum & 0xffff - s2 = sum >> 16 - add = ord(add) - remove = ord(remove) - s1 -= remove - add - s2 -= len * (remove + 1) - s1 - return (s1 & 0xffff) + ((s2 & 0xffff) << 16) - - -class ChunkifyIter(object): - - def __init__(self, fd, chunk_size, window_size): - self.fd = fd - self.chunk_size = chunk_size - self.window_size = window_size - self.buf_size = self.chunk_size * 10 - - def __iter__(self): - self.data = '' - self.done = False - self.i = 0 - self.sum = 0 - self.last = -1 - self.initial = self.window_size - return self - - def next(self): - if self.done: - raise StopIteration - while True: - if self.i == self.buf_size: - diff = self.last + 1 - self.window_size - if diff < 0: - import ipdb - ipdb.set_trace() - self.data = self.data[diff:] - self.last -= diff - self.i -= diff - if self.i == len(self.data): - self.data += self.fd.read(self.buf_size - len(self.data)) - if self.i == len(self.data): - if self.last < self.i - 1: - self.done = True - return self.data[self.last + 1:] - raise StopIteration - if self.initial: - self.initial -= 1 - self.sum = checksum(self.data[self.i], self.sum) - else: - self.sum = roll_checksum(self.sum, - self.data[self.i - self.window_size], - self.data[self.i], - self.window_size) - self.i += 1 - if self.i == self.buf_size and self.last == -1: - old_last = self.last - self.last = self.i - 1 - return self.data[old_last + 1:self.last + 1] - elif self.sum % self.chunk_size == 0: - old_last = self.last - self.last = self.i - 1 - return self.data[old_last + 1:self.last + 1] - - -def chunkify(fd, chunk_size, chunks): - """ - >>> list(chunkify(StringIO.StringIO(''), 5, 3)) - [] - >>> list(chunkify(StringIO.StringIO('A'), 5, 3)) - ['A'] - >>> list(chunkify(StringIO.StringIO('AB'), 5, 3)) - ['AB'] - >>> list(chunkify(StringIO.StringIO('1B'), 5, 3)) - ['1', 'B'] - >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMNOPQ'), 5, 3)) - ['ABCD', 'EFGHI', 'JKLMN', 'OPQ'] - >>> list(chunkify(StringIO.StringIO('1ABCDEFGHIJKLMNOPQ'), 5, 3)) - ['1', 'ABCD', 'EFGHI', 'JKLMN', 'OPQ'] - >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQ'), 5, 3)) - ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQ'] - >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3)) - ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ'] - >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3)) - ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ'] - """ - return ChunkifyIter(fd, chunk_size, chunks) - -try: - import _speedups - checksum = _speedups.checksum - roll_checksum = _speedups.roll_checksum - py_chunkify = chunkify - chunkify = _speedups.chunkify -except ImportError: - print 'Failed to load _speedups module, things will be slow' - - -if __name__ == '__main__': - import doctest - import StringIO - doctest.testmod() diff --git a/darc/keychain.py b/darc/keychain.py index be793b93d..fbc604f12 100644 --- a/darc/keychain.py +++ b/darc/keychain.py @@ -30,6 +30,9 @@ class Keychain(object): if path: self.open(path) + def get_chunkify_seed(self): + return bytes_to_long(self.aes_id[:4]) + def open(self, path): print 'Opening keychain "%s"' % path with open(path, 'rb') as fd: